# frozen_string_literal: true
module Banzai
  module Filter
    # Allows a user to short-circuit our reference shortcuts (such as # or !)
    # by backslash-escaping them, e.g. \#. CommonMark already supports this,
    # but it discards the knowledge that the character was a literal. To keep
    # that knowledge, we wrap each backslash-escaped ASCII punctuation char in
    # a custom marker sequence before the text reaches CommonMark. CommonMark
    # then handles the escape normally, while the marker tells us afterwards
    # that the character was intended literally.
    #
    # The same mechanism matters for escaped characters inside LaTeX math:
    # when detected in math blocks they must be restored to their escaped form.
    #
    # We surround the character rather than merely prefixing it, because
    # CommonMark may turn it into an entity of unknown length. The whole
    # literal ultimately ends up inside a `span` tag, which short-circuits
    # our reference processing.
    #
    # A custom HTML tag is not an option: the surrounded text may sit inside
    # an href, and CommonMark would then fail to parse the link. Hence the
    # plain-text markers `cmliteral-` and `-cmliteral`.
    #
    # https://spec.commonmark.org/0.29/#backslash-escapes
    #
    # This filter performs the initial surrounding; MarkdownPostEscapeFilter
    # converts the markers into span tags.
    class MarkdownPreEscapeFilter < HTML::Pipeline::TextFilter
      # Table of characters needing this special handling: the GitLab special
      # reference characters plus special LaTeX characters.
      #
      # The `token` drives the initial replacement - e.g. `\$` becomes
      # `cmliteral-\+a-cmliteral`. We deliberately do not substitute `\$` with
      # a bare `$`, since `$` doubles as a math-block delimiter and that would
      # complicate parsing. The token keeps an escapable character, `\+`, in
      # front: once markdown has processed the text, `cmliteral-\+a-cmliteral`
      # tells us markdown did _not_ unescape it (an inline code block or
      # similar), so the original escaped form `\$` must be restored. Seeing
      # `cmliteral-+a-cmliteral` instead means markdown treated it as an
      # escape, and the plain `$` should be emitted.
      # See MarkdownPostEscapeFilter for that conversion.
      ESCAPABLE_CHARS = [
        { char: '$', escaped: '\$', token: '\+a', reference: true, latex: true },
        { char: '%', escaped: '\%', token: '\+b', reference: true, latex: true },
        { char: '#', escaped: '\#', token: '\+c', reference: true, latex: true },
        { char: '&', escaped: '\&', token: '\+d', reference: true, latex: true },
        { char: '{', escaped: '\{', token: '\+e', reference: false, latex: true },
        { char: '}', escaped: '\}', token: '\+f', reference: false, latex: true },
        { char: '_', escaped: '\_', token: '\+g', reference: false, latex: true },
        { char: '@', escaped: '\@', token: '\+h', reference: true, latex: false },
        { char: '!', escaped: '\!', token: '\+i', reference: true, latex: false },
        { char: '~', escaped: '\~', token: '\+j', reference: true, latex: false },
        { char: '^', escaped: '\^', token: '\+k', reference: true, latex: false }
      ].freeze

      # All special characters concatenated, for interpolation into the regexp.
      TARGET_CHARS = ESCAPABLE_CHARS.map { |entry| entry[:char] }.join.freeze

      # Captures a backslash followed by any one of the target characters.
      ASCII_PUNCTUATION = %r{(\\[#{TARGET_CHARS}])}.freeze

      LITERAL_KEYWORD = 'cmliteral'

      # Replaces every backslash-escaped special character in @text with its
      # marker-wrapped token, recording in the pipeline result whether any
      # literal was found at all.
      def call
        @text.gsub(ASCII_PUNCTUATION) do |escaped|
          # Most markdown contains no literals; this flag lets the post
          # filter be skipped entirely when none were found.
          result[:escaped_literals] = true

          entry = ESCAPABLE_CHARS.find { |candidate| candidate[:escaped] == escaped }
          replacement = entry.nil? ? escaped : entry[:token]

          "#{LITERAL_KEYWORD}-#{replacement}-#{LITERAL_KEYWORD}"
        end
      end
    end
  end
end