lib/gitlab/email/html_cleaner.rb


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135

# Copied from https://github.com/discourse/discourse/blob/e92f5e4fbf04a88d37dc5069917090abf6c07dec/lib/email/html_cleaner.rb
module Gitlab
  module Email
    # HtmlCleaner cleans up the extremely dirty HTML that many email clients
    # generate by stripping out any excess divs or spans, removing styling in
    # the process (which also makes the html more suitable to be parsed as
    # Markdown).
    class HtmlCleaner
      # Elements to hoist all children out of
      HTML_HOIST_ELEMENTS = %w(div span font table tbody th tr td).freeze
      # Node types to always delete
      HTML_DELETE_ELEMENT_TYPES = [
        Nokogiri::XML::Node::DTD_NODE,
        Nokogiri::XML::Node::COMMENT_NODE
      ].freeze

      # Arguments:
      #   html - Either a HTML string (parsed with Nokogiri::HTML) or an
      #          already-parsed Nokogiri document (used as-is).
      #
      # Private variables:
      #   @doc - nokogiri document
      #   @out - same as @doc, but only if trimming has occurred
      def initialize(html)
        @doc = html.is_a?(String) ? Nokogiri::HTML(html) : html
      end

      class << self
        # Email::HtmlCleaner.trim(inp, opts={})
        #
        # Arguments:
        #   inp - Either a HTML string or a Nokogiri document.
        # Options:
        #   :return => :doc, :string
        #     Specify the desired return type.
        #     Defaults to the type of the input.
        #     A value of :string is equivalent to calling get_document_text()
        #     on the returned document.
        #
        # The opts hash passed in by the caller is only read, never modified.
        def trim(inp, opts={})
          cleaner = HtmlCleaner.new(inp)

          # Resolve the return type into a local instead of writing the default
          # back into opts, so the caller's hash is left untouched.
          return_type = opts[:return] || (inp.is_a?(String) ? :string : :doc)

          if return_type == :string
            cleaner.output_html
          else
            cleaner.output_document
          end
        end

        # Email::HtmlCleaner.get_document_text(doc)
        #
        # Get the body portion of the document, including html, as a string.
        # Falls back to the inner html of the whole document when it contains
        # no <body> element.
        def get_document_text(doc)
          body = doc.xpath('//body')
          # xpath returns a node set, which is truthy even when nothing
          # matched, so emptiness has to be checked explicitly.
          if body.empty?
            doc.inner_html
          else
            body.inner_html
          end
        end
      end

      # Returns the trimmed document. The trimming is performed once, in place
      # on @doc (so a document handed to the constructor is itself modified),
      # and the result is memoized in @out.
      def output_document
        @out ||= begin
          doc = @doc
          trim_process_node(doc)
          add_newlines(doc)
          doc
        end
      end

      # Returns the trimmed document's text as produced by get_document_text.
      def output_html
        HtmlCleaner.get_document_text(output_document)
      end

      private

      # Inserts newline text nodes into doc so that line breaks and paragraphs
      # survive as plain newlines.
      def add_newlines(doc)
        # Replace <br> tags with two newlines (a markdown line break)
        doc.xpath('//br').each do |br|
          br.replace(new_linebreak_node(doc, 2))
        end
        # Surround <p> tags with newlines, to help with line-wise postprocessing
        # and ensure markdown paragraphs
        doc.xpath('//p').each do |para|
          para.before(new_linebreak_node(doc))
          para.after(new_linebreak_node(doc, 2))
        end
      end

      # Builds a text node belonging to doc that holds count newline characters.
      def new_linebreak_node(doc, count=1)
        Nokogiri::XML::Text.new("\n" * count, doc)
      end

      # Recursively cleans node: hoistable elements are replaced by their
      # (recursively cleaned) children, deletable nodes are removed, and every
      # other node has its children cleaned. Returns node.
      def trim_process_node(node)
        if should_hoist?(node)
          trim_hoist_element(node).each { |child| trim_process_node(child) }
        elsif should_delete?(node)
          node.remove
        else
          children = node.children
          children.each { |child| trim_process_node(child) } if children
        end

        node
      end

      # Moves every child of element to just before element, removes the now
      # empty element, and returns the moved children in document order.
      def trim_hoist_element(element)
        hoisted = element.children.map do |child|
          element.before(child)
          child
        end
        element.remove
        hoisted
      end

      # True for element nodes whose name is listed in HTML_HOIST_ELEMENTS.
      def should_hoist?(node)
        node.element? && HTML_HOIST_ELEMENTS.include?(node.name)
      end

      # True for DTD and comment nodes, <head> elements, and text nodes that
      # contain nothing but whitespace.
      def should_delete?(node)
        return true if HTML_DELETE_ELEMENT_TYPES.include?(node.type)
        return true if node.element? && node.name == 'head'
        return true if node.text? && node.text.strip.blank?

        false
      end
    end
  end
end