Try to adopt what Discourse is using right now:adopt-email_reply_trimmer

https://gitlab.com/gitlab-org/gitlab-ce/issues/20514#note_13560164 See also: Issue #3020, #14805, #20514
author: Lin Jen-Shin <godfat@godfat.org> 2016-08-05 10:56:33 +0000
committer: Lin Jen-Shin <godfat@godfat.org> 2016-08-05 10:56:33 +0000
commit: afd00a4b640ba43231258da0368abd92677163f5 (patch)
tree: ed356011287bf6f6ed71379feadff1242cf79fa5
parent: 7b4279984ca9b517f6089931e0fa8c152e195b61 (diff)
download: gitlab-ce-adopt-email_reply_trimmer.tar.gz
4 files changed, 181 insertions, 57 deletions
diff --git a/Gemfile b/Gemfile
index 5d113db686f..dffe7fbbb8c 100644
--- a/Gemfile
+++ b/Gemfile
@@ -332,7 +332,7 @@ gem 'octokit', '~> 4.3.0'
 
 gem 'mail_room', '~> 0.8'
 
-gem 'email_reply_parser', '~> 0.5.8'
+gem 'email_reply_trimmer', '~> 0.1.3'
 
 gem 'ruby-prof', '~> 0.15.9'
 
diff --git a/Gemfile.lock b/Gemfile.lock
index 460a6c409e7..0da153addb7 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -179,7 +179,7 @@ GEM
       railties (>= 4.2)
     dropzonejs-rails (0.7.2)
       rails (> 3.1)
-    email_reply_parser (0.5.8)
+    email_reply_trimmer (0.1.3)
     email_spec (1.6.0)
       launchy (~> 2.1)
       mail (~> 2.2)
@@ -849,7 +849,7 @@ DEPENDENCIES
   diffy (~> 3.0.3)
   doorkeeper (~> 4.0)
   dropzonejs-rails (~> 0.7.1)
-  email_reply_parser (~> 0.5.8)
+  email_reply_trimmer (~> 0.1.3)
   email_spec (~> 1.6.0)
   factory_girl_rails (~> 4.6.0)
   ffaker (~> 2.0.0)
diff --git a/lib/gitlab/email/html_cleaner.rb b/lib/gitlab/email/html_cleaner.rb
new file mode 100644
index 00000000000..48b1247ec1f
--- /dev/null
+++ b/lib/gitlab/email/html_cleaner.rb
@@ -0,0 +1,135 @@
+# Copied from https://github.com/discourse/discourse/blob/e92f5e4fbf04a88d37dc5069917090abf6c07dec/lib/email/html_cleaner.rb
+module Gitlab
+  module Email
+    # HtmlCleaner cleans up the extremely dirty HTML that many email clients
+    # generate by stripping out any excess divs or spans, removing styling in
+    # the process (which also makes the html more suitable to be parsed as
+    # Markdown).
+    class HtmlCleaner
+      # Elements to hoist all children out of
+      HTML_HOIST_ELEMENTS = %w(div span font table tbody th tr td)
+      # Node types to always delete
+      HTML_DELETE_ELEMENT_TYPES = [
+        Nokogiri::XML::Node::DTD_NODE,
+        Nokogiri::XML::Node::COMMENT_NODE,
+      ]
+
+      # Private variables:
+      #   @doc - nokogiri document
+      #   @out - same as @doc, but only if trimming has occured
+      def initialize(html)
+        if String === html
+          @doc = Nokogiri::HTML(html)
+        else
+          @doc = html
+        end
+      end
+
+      class << self
+        # Email::HtmlCleaner.trim(inp, opts={})
+        #
+        # Arguments:
+        #   inp - Either a HTML string or a Nokogiri document.
+        # Options:
+        #   :return => :doc, :string
+        #     Specify the desired return type.
+        #     Defaults to the type of the input.
+        #     A value of :string is equivalent to calling get_document_text()
+        #     on the returned document.
+        def trim(inp, opts={})
+          cleaner = HtmlCleaner.new(inp)
+
+          opts[:return] ||= ((String === inp) ? :string : :doc)
+
+          if opts[:return] == :string
+            cleaner.output_html
+          else
+            cleaner.output_document
+          end
+        end
+
+        # Email::HtmlCleaner.get_document_text(doc)
+        #
+        # Get the body portion of the document, including html, as a string.
+        def get_document_text(doc)
+          body = doc.xpath('//body')
+          if body
+            body.inner_html
+          else
+            doc.inner_html
+          end
+        end
+      end
+
+      def output_document
+        @out ||= begin
+                   doc = @doc
+                   trim_process_node doc
+                   add_newlines doc
+                   doc
+        end
+      end
+
+      def output_html
+        HtmlCleaner.get_document_text(output_document)
+      end
+
+      private
+
+      def add_newlines(doc)
+        # Replace <br> tags with a markdown \n
+        doc.xpath('//br').each do |br|
+          br.replace(new_linebreak_node doc, 2)
+        end
+        # Surround <p> tags with newlines, to help with line-wise postprocessing
+        # and ensure markdown paragraphs
+        doc.xpath('//p').each do |p|
+          p.before(new_linebreak_node doc)
+          p.after(new_linebreak_node doc, 2)
+        end
+      end
+
+      def new_linebreak_node(doc, count=1)
+        Nokogiri::XML::Text.new("\n" * count, doc)
+      end
+
+      def trim_process_node(node)
+        if should_hoist?(node)
+          hoisted = trim_hoist_element node
+          hoisted.each { |child| trim_process_node child }
+        elsif should_delete?(node)
+          node.remove
+        else
+          if children = node.children
+            children.each { |child| trim_process_node child }
+          end
+        end
+
+        node
+      end
+
+      def trim_hoist_element(element)
+        hoisted = []
+        element.children.each do |child|
+          element.before(child)
+          hoisted << child
+        end
+        element.remove
+        hoisted
+      end
+
+      def should_hoist?(node)
+        return false unless node.element?
+        HTML_HOIST_ELEMENTS.include? node.name
+      end
+
+      def should_delete?(node)
+        return true if HTML_DELETE_ELEMENT_TYPES.include? node.type
+        return true if node.element? && node.name == 'head'
+        return true if node.text? && node.text.strip.blank?
+
+        false
+      end
+    end
+  end
+end
diff --git a/lib/gitlab/email/reply_parser.rb b/lib/gitlab/email/reply_parser.rb
index 3411eb1d9ce..7028f0ac8e2 100644
--- a/lib/gitlab/email/reply_parser.rb
+++ b/lib/gitlab/email/reply_parser.rb
@@ -1,78 +1,67 @@
 # Inspired in great part by Discourse's Email::Receiver
+# https://github.com/discourse/discourse/blob/e92f5e4fbf04a88d37dc5069917090abf6c07dec/lib/email/receiver.rb
 module Gitlab
   module Email
     class ReplyParser
-      attr_accessor :message
+      attr_accessor :mail
 
-      def initialize(message)
-        @message = message
+      def initialize(mail)
+        @mail = mail
       end
 
       def execute
-        body = select_body(message)
-
-        encoding = body.encoding
-
-        body = discourse_email_trimmer(body)
-
-        body = EmailReplyParser.parse_reply(body)
+        text = nil
+        html = nil
+
+        if mail.multipart?
+          text = fix_charset(mail.text_part)
+          html = fix_charset(mail.html_part)
+        elsif mail.content_type.to_s['text/html']
+          html = fix_charset(mail)
+        else
+          text = fix_charset(mail)
+        end
 
-        body.force_encoding(encoding).encode("UTF-8")
+        if html.present?
+          cleaned_html = Email::HtmlCleaner.new(html).output_html
+          EmailReplyTrimmer.trim(cleaned_html)
+        elsif text.present?
+          EmailReplyTrimmer.trim(text)
+        else
+          ''
+        end
       end
 
       private
 
-      def select_body(message)
-        text    = message.text_part if message.multipart?
-        text  ||= message           if message.content_type !~ /text\/html/
-
-        return "" unless text
+      # copied from https://github.com/discourse/discourse/blob/e92f5e4fbf04a88d37dc5069917090abf6c07dec/lib/email/receiver.rb
+      def fix_charset(mail_part)
+        return nil if mail_part.blank? || mail_part.body.blank?
 
-        text = fix_charset(text)
-
-        # Certain trigger phrases that means we didn't parse correctly
-        if text =~ /(Content\-Type\:|multipart\/alternative|text\/plain)/
-          return ""
-        end
+        string = mail_part.body.decoded rescue nil
 
-        text
-      end
+        return nil if string.blank?
 
-      # Force encoding to UTF-8 on a Mail::Message or Mail::Part
-      def fix_charset(object)
-        return nil if object.nil?
+        # common encodings
+        encodings = ["UTF-8", "ISO-8859-1"]
+        encodings.unshift(mail_part.charset) if mail_part.charset.present?
 
-        if object.charset
-          object.body.decoded.force_encoding(object.charset.gsub(/utf8/i, "UTF-8")).encode("UTF-8").to_s
-        else
-          object.body.to_s
+        encodings.uniq.each do |encoding|
+          fixed = try_to_encode(string, encoding)
+          return fixed if fixed.present?
         end
-      rescue
+
         nil
       end
 
-      REPLYING_HEADER_LABELS = %w(From Sent To Subject Reply To Cc Bcc Date)
-      REPLYING_HEADER_REGEX = Regexp.union(REPLYING_HEADER_LABELS.map { |label| "#{label}:" })
-
-      def discourse_email_trimmer(body)
-        lines = body.scrub.lines.to_a
-        range_end = 0
-
-        lines.each_with_index do |l, idx|
-          # This one might be controversial but so many reply lines have years, times and end with a colon.
-          # Let's try it and see how well it works.
-          break if (l =~ /\d{4}/ && l =~ /\d:\d\d/ && l =~ /\:$/) ||
-                   (l =~ /On \w+ \d+,? \d+,?.*wrote:/)
-
-          # Headers on subsequent lines
-          break if (0..2).all? { |off| lines[idx + off] =~ REPLYING_HEADER_REGEX }
-          # Headers on the same line
-          break if REPLYING_HEADER_LABELS.count { |label| l.include?(label) } >= 3
-
-          range_end = idx
-        end
-
-        lines[0..range_end].join.strip
+      # copied from https://github.com/discourse/discourse/blob/e92f5e4fbf04a88d37dc5069917090abf6c07dec/lib/email/receiver.rb
+      def try_to_encode(string, encoding)
+        encoded = string.encode("UTF-8", encoding)
+        encoded.present? && encoded.valid_encoding? ? encoded : nil
+      rescue Encoding::InvalidByteSequenceError,
+             Encoding::UndefinedConversionError,
+             Encoding::ConverterNotFoundError
+        nil
       end
     end
   end
author	Lin Jen-Shin <godfat@godfat.org>	2016-08-05 10:56:33 +0000
committer	Lin Jen-Shin <godfat@godfat.org>	2016-08-05 10:56:33 +0000
commit	afd00a4b640ba43231258da0368abd92677163f5 (patch)
tree	ed356011287bf6f6ed71379feadff1242cf79fa5
parent	7b4279984ca9b517f6089931e0fa8c152e195b61 (diff)
download	gitlab-ce-adopt-email_reply_trimmer.tar.gz