lib/banzai/filter/markdown_post_escape_filter.rb


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89

# frozen_string_literal: true

module Banzai
  module Filter
    # See comments in MarkdownPreEscapeFilter for details on strategy
    class MarkdownPostEscapeFilter < HTML::Pipeline::Filter
      LITERAL_KEYWORD   = MarkdownPreEscapeFilter::LITERAL_KEYWORD
      LITERAL_REGEX     = %r{#{LITERAL_KEYWORD}-(.*?)-#{LITERAL_KEYWORD}}.freeze
      NOT_LITERAL_REGEX = %r{#{LITERAL_KEYWORD}-((%5C|\\).+?)-#{LITERAL_KEYWORD}}.freeze
      SPAN_REGEX        = %r{<span>(.*?)</span>}.freeze

      XPATH_A         = Gitlab::Utils::Nokogiri.css_to_xpath('a').freeze
      XPATH_LANG_TAG  = Gitlab::Utils::Nokogiri.css_to_xpath('pre').freeze
      XPATH_CODE_SPAN = Gitlab::Utils::Nokogiri.css_to_xpath('code > span').freeze

      def call
        return doc unless result[:escaped_literals]

        new_html = unescaped_literals(doc.to_html)
        new_html = add_spans(new_html)

        @doc = parse_html(new_html)

        remove_spans_in_certain_attributes
        remove_spans_in_code

        doc
      end

      private

      # For any literals that actually didn't get escape processed
      # (for example in code blocks), remove the special sequence.
      def unescaped_literals(html)
        html.gsub!(NOT_LITERAL_REGEX) do |match|
          last_match = ::Regexp.last_match(1)
          last_match_token = last_match.sub('%5C', '\\')

          escaped_item = Banzai::Filter::MarkdownPreEscapeFilter::ESCAPABLE_CHARS.find { |item| item[:token] == last_match_token }
          escaped_char = escaped_item ? escaped_item[:escaped] : last_match

          escaped_char = escaped_char.sub('\\', '%5C') if last_match.start_with?('%5C')

          escaped_char
        end

        html
      end

      # Replace any left over literal sequences with `span` so that our
      # reference processing is short-circuited
      def add_spans(html)
        html.gsub!(LITERAL_REGEX) do |match|
          last_match = ::Regexp.last_match(1)
          last_match_token = "\\#{last_match}"

          escaped_item = Banzai::Filter::MarkdownPreEscapeFilter::ESCAPABLE_CHARS.find { |item| item[:token] == last_match_token }
          escaped_char = escaped_item ? escaped_item[:char] : ::Regexp.last_match(1)

          "<span>#{escaped_char}</span>"
        end

        html
      end

      # Since literals are converted in links, we need to remove any surrounding `span`.
      def remove_spans_in_certain_attributes
        doc.xpath(XPATH_A).each do |node|
          node.attributes['href'].value  = node.attributes['href'].value.gsub(SPAN_REGEX, '\1') if node.attributes['href']
          node.attributes['title'].value = node.attributes['title'].value.gsub(SPAN_REGEX, '\1') if node.attributes['title']
        end

        doc.xpath(XPATH_LANG_TAG).each do |node|
          node.attributes['lang'].value  = node.attributes['lang'].value.gsub(SPAN_REGEX, '\1') if node.attributes['lang']
        end
      end

      # Any `<span>` that makes it into a `<code>` element is from the math processing,
      # convert back to the escaped character, such as `\$`
      def remove_spans_in_code
        doc.xpath(XPATH_CODE_SPAN).each do |node|
          escaped_item = Banzai::Filter::MarkdownPreEscapeFilter::ESCAPABLE_CHARS.find { |item| item[:char] == node.content && item[:latex] }

          node.replace(escaped_item[:escaped]) if escaped_item
        end
      end
    end
  end
end