summaryrefslogtreecommitdiff
path: root/lib/banzai/filter/ascii_doc_sanitization_filter.rb
blob: 9105e86ad04d57b883d5bf262a50fc1829f51e5c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# frozen_string_literal: true

module Banzai
  module Filter
    # Sanitize HTML produced by AsciiDoc/Asciidoctor.
    #
    # Extends Banzai::Filter::BaseSanitizationFilter with specific rules:
    # - `mark` elements are allowed;
    # - `class` is allowed on the elements listed in ELEMENT_CLASSES_WHITELIST,
    #   but only the class names listed there survive;
    # - `id` is allowed on section headings when it matches
    #   SECTION_LINK_REF_PATTERN;
    # - `id` is allowed on `a` and `div` when it matches the corresponding
    #   footnote pattern.
    class AsciiDocSanitizationFilter < Banzai::Filter::BaseSanitizationFilter
      # Section anchor link pattern: the configured id prefix followed by one
      # or more alphanumerics, hyphens or underscores.
      # `(?:...)` is a non-capturing group. Spelling it `(:?...)` would instead
      # accept an optional literal colon before every alphanumeric.
      SECTION_LINK_REF_PATTERN = /\A#{Gitlab::Asciidoc::DEFAULT_ADOC_ATTRS['idprefix']}(?:[[:alnum:]]|-|_)+\z/.freeze
      SECTION_HEADINGS = %w(h2 h3 h4 h5 h6).freeze

      # Footnote link patterns, keyed by the element that may carry the id
      FOOTNOTE_LINK_ID_PATTERNS = {
        a: /\A_footnoteref_\d+\z/,
        div: /\A_footnotedef_\d+\z/
      }.freeze

      # Classes used by Asciidoctor to style components
      ADMONITION_CLASSES = %w(fa icon-note icon-tip icon-warning icon-caution icon-important).freeze
      CALLOUT_CLASSES = ['conum'].freeze
      CHECKLIST_CLASSES = %w(fa fa-check-square-o fa-square-o).freeze
      LIST_CLASSES = %w(checklist none no-bullet unnumbered unstyled).freeze

      # Class names that are kept, per element name. Any other class name found
      # on these elements is removed by `remove_element_classes`.
      ELEMENT_CLASSES_WHITELIST = {
        span: %w(big small underline overline line-through).freeze,
        div: ['admonitionblock'].freeze,
        td: ['icon'].freeze,
        # `fa` appears in both source lists; deduplicate and freeze like the
        # sibling entries.
        i: (ADMONITION_CLASSES + CALLOUT_CLASSES + CHECKLIST_CLASSES).uniq.freeze,
        ul: LIST_CLASSES,
        ol: LIST_CLASSES,
        a: ['anchor'].freeze
      }.freeze

      # Adds the AsciiDoc-specific rules to the given sanitization whitelist.
      #
      # The hash is modified in place. It must provide `:elements`,
      # `:attributes` and `:transformers` entries, and `:attributes` must
      # already hold arrays for `div` and `a`, because those are appended to
      # rather than replaced.
      #
      # @param whitelist [Hash] sanitization configuration to extend
      # @return [Hash] the same `whitelist` object
      def customize_whitelist(whitelist)
        # Allow marks
        whitelist[:elements].push('mark')

        # Allow any classes in `span`, `i`, `div`, `td`, `ul`, `ol` and `a` elements
        # but then remove any unknown classes
        whitelist[:attributes]['span'] = %w(class)
        whitelist[:attributes]['div'].push('class')
        whitelist[:attributes]['td'] = %w(class)
        whitelist[:attributes]['i'] = %w(class)
        whitelist[:attributes]['ul'] = %w(class)
        whitelist[:attributes]['ol'] = %w(class)
        whitelist[:attributes]['a'].push('class')
        whitelist[:transformers].push(self.class.remove_element_classes)

        # Allow `id` in heading elements for section anchors
        SECTION_HEADINGS.each do |header|
          whitelist[:attributes][header] = %w(id)
        end
        whitelist[:transformers].push(self.class.remove_non_heading_ids)

        # Allow `id` in footnote elements
        FOOTNOTE_LINK_ID_PATTERNS.each_key do |element|
          whitelist[:attributes][element.to_s].push('id')
        end
        whitelist[:transformers].push(self.class.remove_non_footnote_ids)

        whitelist
      end

      class << self
        # Builds a transformer that removes the `id` attribute from `a` and
        # `div` nodes unless it matches that element's footnote pattern.
        # Nodes of any other element are left untouched.
        #
        # @return [Proc] lambda taking the transformer `env` hash (reads `env[:node]`)
        def remove_non_footnote_ids
          lambda do |env|
            node = env[:node]

            return unless (pattern = FOOTNOTE_LINK_ID_PATTERNS[node.name.to_sym])
            return unless node.has_attribute?('id')

            # Well-formed footnote id: keep it
            return if pattern.match?(node['id'])

            node.remove_attribute('id')
          end
        end

        # Builds a transformer that removes the `id` attribute from section
        # heading nodes (SECTION_HEADINGS) unless it matches
        # SECTION_LINK_REF_PATTERN. Other nodes are left untouched.
        #
        # @return [Proc] lambda taking the transformer `env` hash (reads `env[:node]`)
        def remove_non_heading_ids
          lambda do |env|
            node = env[:node]

            return unless SECTION_HEADINGS.include?(node.name)
            return unless node.has_attribute?('id')

            # Well-formed section anchor id: keep it
            return if SECTION_LINK_REF_PATTERN.match?(node['id'])

            node.remove_attribute('id')
          end
        end

        # Builds a transformer that filters the `class` attribute of the
        # elements listed in ELEMENT_CLASSES_WHITELIST, keeping only the class
        # names whitelisted for that element. The attribute is dropped
        # entirely when no class name survives.
        #
        # @return [Proc] lambda taking the transformer `env` hash (reads `env[:node]`)
        def remove_element_classes
          lambda do |env|
            node = env[:node]

            return unless (classes_whitelist = ELEMENT_CLASSES_WHITELIST[node.name.to_sym])
            return unless node.has_attribute?('class')

            classes = node['class'].strip.split(' ')
            # Array intersection keeps the order in which the classes appear
            # on the node and drops repeated names.
            allowed_classes = classes & classes_whitelist
            if allowed_classes.empty?
              node.remove_attribute('class')
            else
              node['class'] = allowed_classes.join(' ')
            end
          end
        end
      end
    end
  end
end