summaryrefslogtreecommitdiff
path: root/lib/gitlab/markdown/autolink_filter.rb
blob: 541f1d88ffc356f86cfb9ae97e77d007833ff6bb (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
require 'html/pipeline/filter'
require 'uri'

module Gitlab
  module Markdown
    # HTML Filter for auto-linking URLs in HTML.
    #
    # Based on HTML::Pipeline::AutolinkFilter
    #
    # Context options:
    #   :autolink  - Boolean, skips all processing done by this filter when false
    #   :link_attr - Hash of attributes for the generated links
    #
    class AutolinkFilter < HTML::Pipeline::Filter
      include ActionView::Helpers::TagHelper

      # Pattern to match text that should be autolinked.
      #
      # A URI scheme begins with a letter and may contain letters, numbers,
      # plus, period and hyphen. Schemes are case-insensitive but we're being
      # picky here and allowing only lowercase for autolinks.
      #
      # See http://en.wikipedia.org/wiki/URI_scheme
      #
      # The negative lookbehind ensures that users can paste a URL followed by a
      # period or comma for punctuation without those characters being included
      # in the generated link.
      #
      # Rubular: http://rubular.com/r/cxjPyZc7Sb
      LINK_PATTERN = %r{([a-z][a-z0-9\+\.-]+://\S+)(?<!,|\.)}

      # Text matching LINK_PATTERN inside these elements will not be linked.
      # Frozen so the shared set cannot be modified by accident at runtime.
      IGNORE_PARENTS = %w(a code kbd pre script style).to_set.freeze

      # Entry point invoked by the pipeline.
      #
      # Returns the document untouched when the `:autolink` context option is
      # explicitly `false`; otherwise runs the Rinku pass followed by the
      # LINK_PATTERN pass and returns the resulting document.
      def call
        # Compare against `false` on purpose: an absent (nil) option must keep
        # autolinking enabled.
        return doc if context[:autolink] == false

        rinku_parse
        text_parse
      end

      private

      # Run the text through Rinku as a first pass
      #
      # This will quickly autolink http(s) and ftp links.
      #
      # `@doc` will be re-parsed with the HTML String from Rinku.
      def rinku_parse
        # Convert the options from a Hash to a String that Rinku expects
        options = tag_options(link_options)

        # NOTE: We don't parse email links because it will erroneously match
        # external Commit and CommitRange references.
        #
        # The final argument tells Rinku to link short URLs that don't include a
        # period (e.g., http://localhost:3000/)
        rinku = Rinku.auto_link(html, :urls, options, IGNORE_PARENTS.to_a, 1)

        # Rinku returns a String, so parse it back to a Nokogiri::XML::Document
        # for further processing.
        @doc = parse_html(rinku)
      end

      # Autolinks any text matching LINK_PATTERN that Rinku didn't already
      # replace
      #
      # Returns the (possibly modified) document.
      def text_parse
        search_text_nodes(doc).each do |node|
          # Reject ignored ancestors first so that excluded nodes are never
          # serialized.
          next if has_ancestor?(node, IGNORE_PARENTS)

          content = node.to_html

          next unless content.match(LINK_PATTERN)

          # If Rinku didn't link this, there's probably a good reason, so we'll
          # skip it too ("http" also covers "https").
          next if content.start_with?('http', 'ftp')

          # Named `linked` rather than `html` to avoid shadowing the `html`
          # accessor that `rinku_parse` relies on.
          linked = autolink_filter(content)

          next if linked == content

          node.replace(linked)
        end

        doc
      end

      # Wraps every LINK_PATTERN match found in `text` in an `<a>` element
      # built with `link_options` plus an `href` pointing at the match.
      #
      # text - String of serialized text-node content
      #
      # Returns a String with the matches replaced by link markup.
      #
      # NOTE(review): `text` comes from `node.to_html`, so it is presumably
      # already entity-escaped; `content_tag` may escape it a second time for
      # URLs containing `&` - confirm against the Rails version in use.
      def autolink_filter(text)
        text.gsub(LINK_PATTERN) do |match|
          # Remove any trailing HTML entities and store them for appending
          # outside the link element. `slice!` strips them from `match` and
          # returns the removed text (nil when nothing matched).
          trailing = match.slice!(/(?:&[\w#]+;)+\z/) || ''

          # The entities must be marked HTML safe in order to be output
          # literally rather than escaped.
          content_tag(:a, match, link_options.merge(href: match)) + trailing.html_safe
        end
      end

      # Attributes applied to every generated link, taken from the
      # `:link_attr` context option (empty Hash when not provided). Memoized.
      def link_options
        @link_options ||= context[:link_attr] || {}
      end
    end
  end
end