lib/banzai/filter/sanitization_filter.rb


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140

# frozen_string_literal: true

module Banzai
  module Filter
    # Sanitize HTML
    #
    # Extends HTML::Pipeline::SanitizationFilter with a custom whitelist.
    class SanitizationFilter < HTML::Pipeline::SanitizationFilter
      include Gitlab::Utils::StrongMemoize

      UNSAFE_PROTOCOLS        = %w(data javascript vbscript).freeze
      TABLE_ALIGNMENT_PATTERN = /text-align: (?<alignment>center|left|right)/.freeze

      def whitelist
        strong_memoize(:whitelist) do
          customize_whitelist(super.deep_dup)
        end
      end

      private

      def customize_whitelist(whitelist)
        # Allow table alignment; we whitelist specific text-align values in a
        # transformer below
        whitelist[:attributes]['th'] = %w(style)
        whitelist[:attributes]['td'] = %w(style)
        whitelist[:css] = { properties: ['text-align'] }

        # Allow span elements
        whitelist[:elements].push('span')

        # Allow data-math-style attribute in order to support LaTeX formatting
        whitelist[:attributes]['code'] = %w(data-math-style)
        whitelist[:attributes]['pre'] = %w(data-math-style)

        # Allow html5 details/summary elements
        whitelist[:elements].push('details')
        whitelist[:elements].push('summary')

        # Allow abbr elements with title attribute
        whitelist[:elements].push('abbr')
        whitelist[:attributes]['abbr'] = %w(title)

        # Allow the 'data-sourcepos' from CommonMark on all elements
        whitelist[:attributes][:all].push('data-sourcepos')

        # Disallow `name` attribute globally, allow on `a`
        whitelist[:attributes][:all].delete('name')
        whitelist[:attributes]['a'].push('name')

        # Allow any protocol in `a` elements
        # and then remove links with unsafe protocols
        whitelist[:protocols].delete('a')
        whitelist[:transformers].push(self.class.remove_unsafe_links)

        # Remove `rel` attribute from `a` elements
        whitelist[:transformers].push(self.class.remove_rel)

        # Remove any `style` properties not required for table alignment
        whitelist[:transformers].push(self.class.remove_unsafe_table_style)

        # Allow `id` in a and li elements for footnotes
        # and remove any `id` properties not matching for footnotes
        whitelist[:attributes]['a'].push('id')
        whitelist[:attributes]['li'] = %w(id)
        whitelist[:transformers].push(self.class.remove_non_footnote_ids)

        whitelist
      end

      class << self
        def remove_unsafe_links
          lambda do |env|
            node = env[:node]

            return unless node.name == 'a'
            return unless node.has_attribute?('href')

            begin
              node['href'] = node['href'].strip
              uri = Addressable::URI.parse(node['href'])

              return unless uri.scheme

              # Remove all invalid scheme characters before checking against the
              # list of unsafe protocols.
              #
              # See https://tools.ietf.org/html/rfc3986#section-3.1
              scheme = uri.scheme
                .strip
                .downcase
                .gsub(/[^A-Za-z0-9\+\.\-]+/, '')

              node.remove_attribute('href') if UNSAFE_PROTOCOLS.include?(scheme)
            rescue Addressable::URI::InvalidURIError
              node.remove_attribute('href')
            end
          end
        end

        def remove_rel
          lambda do |env|
            if env[:node_name] == 'a'
              env[:node].remove_attribute('rel')
            end
          end
        end

        def remove_unsafe_table_style
          lambda do |env|
            node = env[:node]

            return unless node.name == 'th' || node.name == 'td'
            return unless node.has_attribute?('style')

            if node['style'] =~ TABLE_ALIGNMENT_PATTERN
              node['style'] = "text-align: #{$~[:alignment]}"
            else
              node.remove_attribute('style')
            end
          end
        end

        def remove_non_footnote_ids
          lambda do |env|
            node = env[:node]

            return unless node.name == 'a' || node.name == 'li'
            return unless node.has_attribute?('id')

            return if node.name == 'a' && node['id'] =~ Banzai::Filter::FootnoteFilter::FOOTNOTE_LINK_REFERENCE_PATTERN
            return if node.name == 'li' && node['id'] =~ Banzai::Filter::FootnoteFilter::FOOTNOTE_LI_REFERENCE_PATTERN

            node.remove_attribute('id')
          end
        end
      end
    end
  end
end