lib/banzai/filter/spaced_link_filter.rb


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98

# frozen_string_literal: true

require 'uri'

module Banzai
  module Filter
    # HTML Filter for markdown links with spaces in the URLs
    #
    # Based on Banzai::Filter::AutolinkFilter
    #
    # CommonMark does not allow spaces in the url portion of a link/url.
    # For example, `[example](page slug)` is not valid.
    # Neither is `![example](test image.jpg)`. However, particularly
    # in our wikis, we support (via RedCarpet) this type of link, allowing
    # wiki pages to be easily linked by their title.  This filter adds that functionality.
    #
    # This is a small extension to the CommonMark spec. If CommonMark ever
    # allows spaces in URLs, this filter can be removed.
    #
    class SpacedLinkFilter < HTML::Pipeline::Filter
      include ActionView::Helpers::TagHelper

      # Matches a markdown link or image: an optional leading `!`, the
      # bracketed text, and a parenthesised destination that may be followed
      # by a space and a double-quoted title.
      #
      # Rubular: http://rubular.com/r/2EXEQ49rg5
      LINK_OR_IMAGE_PATTERN = %r{
        (?<preview_operator>!)?
        \[(?<text>.+?)\]
        \(
          (?<new_link>.+?)
          (?<title>\ ".+?")?
        \)
      }x

      # Text nodes nested inside any of these elements are left untouched
      IGNORE_PARENTS = %w(a code kbd pre script style).to_set

      # XPath selecting the candidate text nodes: those with no ancestor listed
      # in IGNORE_PARENTS whose content includes the literal `](`.
      TEXT_QUERY = %Q(descendant-or-self::text()[
        not(#{IGNORE_PARENTS.map { |p| "ancestor::#{p}" }.join(' or ')})
        and contains(., ']\(')
      ]).freeze

      # Filter entry point.
      #
      # Returns the document unchanged when context[:markdown_engine] is
      # :redcarpet. Otherwise every text node selected by TEXT_QUERY that
      # matches LINK_OR_IMAGE_PATTERN is run through spaced_link_filter, and
      # the node is replaced only when that produced different markup.
      #
      # Returns the (possibly modified) document.
      def call
        return doc if context[:markdown_engine] == :redcarpet

        doc.xpath(TEXT_QUERY).each do |text_node|
          original = text_node.to_html

          # Cheap pre-check so non-matching nodes skip the marker entirely
          next unless LINK_OR_IMAGE_PATTERN.match(original)

          rewritten = spaced_link_filter(original)

          text_node.replace(rewritten) unless rewritten == original
        end

        doc
      end

      private

      # Renders a single matched link/image.
      #
      # link - String expected to match LINK_OR_IMAGE_PATTERN
      #
      # Returns `link` itself when it does not match; otherwise the HTML
      # produced by Banzai::Filter::MarkdownFilter for the space-escaped
      # markdown, minus the surrounding paragraph tags.
      def spaced_link_match(link)
        matched = LINK_OR_IMAGE_PATTERN.match(link)
        return link if matched.nil?

        # With the spaces escaped the link is valid markdown, so the markdown
        # processor can render it a second time
        escaped_markdown = transform_markdown(matched)
        rendered = Banzai::Filter::MarkdownFilter.call(escaped_markdown, context)

        # The rendered link comes back wrapped in a <p>; drop the first opening
        # tag and a trailing closing tag
        rendered.sub('<p>', '').chomp('</p>')
      end

      # Hands the HTML-unescaped text and its html_safe original to
      # Gitlab::StringRegexMarker, marking every LINK_OR_IMAGE_PATTERN match
      # with the output of spaced_link_match. See Gitlab::StringRegexMarker
      # for the exact marking contract.
      #
      # text - String of escaped HTML taken from a text node
      def spaced_link_filter(text)
        marker = Gitlab::StringRegexMarker.new(CGI.unescapeHTML(text), text.html_safe)

        marker.mark(LINK_OR_IMAGE_PATTERN) do |link, left:, right:|
          spaced_link_match(link)
        end
      end

      # Rebuilds the markdown source for a match from the pieces returned by
      # process_match, i.e. with the spaces in the destination escaped.
      #
      # match - object responding to [] with the pattern's capture names
      #
      # Returns a String of the form `![text](link "title")`, where the `!`
      # and the title are present only if they were captured.
      def transform_markdown(match)
        bang, label, destination, title_part = process_match(match)

        "#{bang}[#{label}](#{destination}#{title_part})"
      end

      # Pulls the four named captures out of a match, replacing each space in
      # the link destination with `%20`.
      #
      # Returns [preview_operator, text, new_link, title].
      def process_match(match)
        bang = match[:preview_operator]
        label = match[:text]
        destination = match[:new_link].gsub(' ', '%20')

        [bang, label, destination, match[:title]]
      end
    end
  end
end