1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
|
# frozen_string_literal: true
module Banzai
module Filter
# HTML Filter to modify the attributes of external links
class ExternalLinkFilter < HTML::Pipeline::Filter
# URI schemes that receive the nofollow / new-tab treatment. `nil` is listed
# so that URIs without a scheme, and links whose URI could not be parsed
# (where the scheme lookup yields nil), are matched as well.
SCHEMES = ['http', 'https', nil].freeze
# The Unicode RIGHT-TO-LEFT OVERRIDE character (U+202E).
RTLO = "\u202E"
# Percent-encoded UTF-8 form of the RTLO character.
ENCODED_RTLO = '%E2%80%AE'
# Visits every link returned by `links`, rewrites its source attribute with
# the strictly parsed URI, and applies the external-link treatment to every
# link that `internal_url?` does not recognise as internal.
#
# Returns `doc`.
def call
  links.each do |node|
    # URI.parse is stricter than Addressable (for instance on `mailto:`
    # links), so it serves as the validity gate. Addressable is then used
    # for the actual inspection because it supports IDN, etc.
    parsed = nil
    strict = uri_strict(node_src(node))

    if strict
      node.set_attribute(node_src_attribute(node), strict.to_s)
      parsed = addressable_uri(node_src(node))
    end

    next if internal_url?(parsed)

    punycode_autolink_node!(parsed, node)
    sanitize_link_text!(node)
    add_malicious_tooltip!(parsed, node)
    add_nofollow!(parsed, node)
  end

  doc
end
private
# Name of the attribute that carries the link target. For a link to a
# proxied image, `src` already holds the correct proxied url, so the
# `data-canonical-src` attribute is used when present; otherwise `href`.
def node_src_attribute(node)
  return 'data-canonical-src' if node['data-canonical-src']

  'href'
end
# Value of the attribute selected by `node_src_attribute` for this node.
def node_src(node)
  attribute_name = node_src_attribute(node)
  node[attribute_name]
end
# Parses `href` with the standard library URI parser.
#
# Returns the parsed URI, or nil when parsing raises a URI::Error.
def uri_strict(href)
  strict = URI.parse(href)
rescue URI::Error
  nil
else
  strict
end
# Parses `href` with Addressable.
#
# Returns the parsed Addressable URI, or nil when Addressable raises
# InvalidURIError.
def addressable_uri(href)
  parsed = Addressable::URI.parse(href)
rescue Addressable::URI::InvalidURIError
  nil
else
  parsed
end
# All `<a>` elements (the root node included) whose `href` attribute is
# present and not empty.
def links
  doc.xpath('descendant-or-self::a[@href and not(@href = "")]')
end
# Whether `uri` points at this GitLab instance.
#
# A nil uri is never internal. A uri with neither hostname nor scheme is a
# relative URL and therefore internal. Anything else is internal only when
# its hostname equals the hostname of `internal_url`.
def internal_url?(uri)
  return false if uri.nil?

  host = uri.hostname
  # Relative URLs miss a hostname AND a scheme
  return true unless host || uri.scheme

  host == internal_url.hostname
end
# The configured GitLab URL (`Gitlab.config.gitlab.url`) parsed with
# URI.parse; the result is memoized in `@internal_url`.
def internal_url
  return @internal_url if @internal_url

  @internal_url = URI.parse(Gitlab.config.gitlab.url)
end
# Only replace an autolink that has an IDN with its punycode version when
# emailable links are requested (`context[:emailable_links]`). Otherwise
# the text is left as is, and the tooltip exposes the punycode version.
#
# A link counts as an autolink here when its unencoded source equals the
# node's text content.
def punycode_autolink_node!(uri, node)
  return unless uri && context[:emailable_links]

  decoded_src = Addressable::URI.unencode(node_src(node))
  return unless decoded_src == node.content && idn?(uri)

  node.content = uri.normalize
end
# Replaces every right-to-left override (RTLO) character in the node's
# inner HTML with its percent-encoded form.
def sanitize_link_text!(node)
  escaped_html = node.inner_html.gsub(RTLO, ENCODED_RTLO)
  node.inner_html = escaped_html
end
# Exposes the normalized URL in a tooltip for links that look suspicious.
#
# If the domain is an international domain name (IDN), the tooltip shows
# the normalized form in case the link is intended to be malicious. This
# matters most when the link text differs from the actual link. Autolinked
# link text keeps showing the unicode version of the domain, which could
# contain emojis, etc.
#
# The tooltip is also added when the url contains the encoded RTLO
# character, as this is an indicator of a malicious link.
def add_malicious_tooltip!(uri, node)
  return unless idn?(uri) || has_encoded_rtlo?(uri)

  node.add_class('has-tooltip')
  node.set_attribute('title', uri.normalize)
end
# For links whose scheme is listed in SCHEMES (this includes a missing
# scheme and a nil uri), forces `rel="nofollow noreferrer noopener"` and
# `target="_blank"`. A pre-existing `rel` of exactly "license" is kept by
# appending it back after the overwrite.
def add_nofollow!(uri, node)
  return unless SCHEMES.include?(uri&.scheme)

  keep_license = node.attribute('rel')&.value == 'license'
  node.set_attribute('rel', 'nofollow noreferrer noopener')
  node.kwattr_append('rel', 'license') if keep_license
  node.set_attribute('target', '_blank')
end
# Whether the URI's normalized host contains an internationalized (punycode)
# label.
#
# The `xn--` prefix marks an individual DNS label, not the whole host, so
# every dot-separated label is inspected. Checking only the start of the
# host would miss hosts such as `www.xn--bcher-kva.example`.
#
# Returns nil when `uri` is nil or has no normalized host, otherwise
# true/false.
def idn?(uri)
  host = uri&.normalized_host
  return unless host

  host.split('.').any? { |label| label.start_with?('xn--') }
end
# Whether the URI's string form contains the percent-encoded RTLO character.
#
# The hex digits of a percent-encoded octet are case-insensitive
# (`%e2%80%ae` and `%E2%80%AE` denote the same bytes), so the comparison is
# done on upper-cased strings. A case-sensitive match would let the
# lowercase spelling slip past the malicious-link tooltip.
#
# Returns nil when `uri` is nil, otherwise true/false.
def has_encoded_rtlo?(uri)
  return if uri.nil?

  uri.to_s.upcase.include?(ENCODED_RTLO.upcase)
end
end
end
end
|