summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLin Jen-Shin <godfat@godfat.org>2016-08-05 10:56:33 +0000
committerLin Jen-Shin <godfat@godfat.org>2016-08-05 10:56:33 +0000
commitafd00a4b640ba43231258da0368abd92677163f5 (patch)
treeed356011287bf6f6ed71379feadff1242cf79fa5
parent7b4279984ca9b517f6089931e0fa8c152e195b61 (diff)
downloadgitlab-ce-adopt-email_reply_trimmer.tar.gz
Try to adopt what Discourse is using right now:adopt-email_reply_trimmer
https://gitlab.com/gitlab-org/gitlab-ce/issues/20514#note_13560164 See also: Issue #3020, #14805, #20514
-rw-r--r--Gemfile2
-rw-r--r--Gemfile.lock4
-rw-r--r--lib/gitlab/email/html_cleaner.rb135
-rw-r--r--lib/gitlab/email/reply_parser.rb97
4 files changed, 181 insertions, 57 deletions
diff --git a/Gemfile b/Gemfile
index 5d113db686f..dffe7fbbb8c 100644
--- a/Gemfile
+++ b/Gemfile
@@ -332,7 +332,7 @@ gem 'octokit', '~> 4.3.0'
gem 'mail_room', '~> 0.8'
-gem 'email_reply_parser', '~> 0.5.8'
+gem 'email_reply_trimmer', '~> 0.1.3'
gem 'ruby-prof', '~> 0.15.9'
diff --git a/Gemfile.lock b/Gemfile.lock
index 460a6c409e7..0da153addb7 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -179,7 +179,7 @@ GEM
railties (>= 4.2)
dropzonejs-rails (0.7.2)
rails (> 3.1)
- email_reply_parser (0.5.8)
+ email_reply_trimmer (0.1.3)
email_spec (1.6.0)
launchy (~> 2.1)
mail (~> 2.2)
@@ -849,7 +849,7 @@ DEPENDENCIES
diffy (~> 3.0.3)
doorkeeper (~> 4.0)
dropzonejs-rails (~> 0.7.1)
- email_reply_parser (~> 0.5.8)
+ email_reply_trimmer (~> 0.1.3)
email_spec (~> 1.6.0)
factory_girl_rails (~> 4.6.0)
ffaker (~> 2.0.0)
diff --git a/lib/gitlab/email/html_cleaner.rb b/lib/gitlab/email/html_cleaner.rb
new file mode 100644
index 00000000000..48b1247ec1f
--- /dev/null
+++ b/lib/gitlab/email/html_cleaner.rb
@@ -0,0 +1,135 @@
+# Copied from https://github.com/discourse/discourse/blob/e92f5e4fbf04a88d37dc5069917090abf6c07dec/lib/email/html_cleaner.rb
+module Gitlab
+ module Email
+ # HtmlCleaner cleans up the extremely dirty HTML that many email clients
+ # generate by stripping out any excess divs or spans, removing styling in
+ # the process (which also makes the html more suitable to be parsed as
+ # Markdown).
+ class HtmlCleaner
+ # Elements to hoist all children out of
+ HTML_HOIST_ELEMENTS = %w(div span font table tbody th tr td)
+ # Node types to always delete
+ HTML_DELETE_ELEMENT_TYPES = [
+ Nokogiri::XML::Node::DTD_NODE,
+ Nokogiri::XML::Node::COMMENT_NODE,
+ ]
+
+ # Private variables:
+ # @doc - nokogiri document
+ # @out - same as @doc, but only if trimming has occured
+ def initialize(html)
+ if String === html
+ @doc = Nokogiri::HTML(html)
+ else
+ @doc = html
+ end
+ end
+
+ class << self
+ # Email::HtmlCleaner.trim(inp, opts={})
+ #
+ # Arguments:
+ # inp - Either a HTML string or a Nokogiri document.
+ # Options:
+ # :return => :doc, :string
+ # Specify the desired return type.
+ # Defaults to the type of the input.
+ # A value of :string is equivalent to calling get_document_text()
+ # on the returned document.
+ def trim(inp, opts={})
+ cleaner = HtmlCleaner.new(inp)
+
+ opts[:return] ||= ((String === inp) ? :string : :doc)
+
+ if opts[:return] == :string
+ cleaner.output_html
+ else
+ cleaner.output_document
+ end
+ end
+
+ # Email::HtmlCleaner.get_document_text(doc)
+ #
+ # Get the body portion of the document, including html, as a string.
+ def get_document_text(doc)
+ body = doc.xpath('//body')
+ if body
+ body.inner_html
+ else
+ doc.inner_html
+ end
+ end
+ end
+
+ def output_document
+ @out ||= begin
+ doc = @doc
+ trim_process_node doc
+ add_newlines doc
+ doc
+ end
+ end
+
+ def output_html
+ HtmlCleaner.get_document_text(output_document)
+ end
+
+ private
+
+ def add_newlines(doc)
+ # Replace <br> tags with a markdown \n
+ doc.xpath('//br').each do |br|
+ br.replace(new_linebreak_node doc, 2)
+ end
+ # Surround <p> tags with newlines, to help with line-wise postprocessing
+ # and ensure markdown paragraphs
+ doc.xpath('//p').each do |p|
+ p.before(new_linebreak_node doc)
+ p.after(new_linebreak_node doc, 2)
+ end
+ end
+
+ def new_linebreak_node(doc, count=1)
+ Nokogiri::XML::Text.new("\n" * count, doc)
+ end
+
+ def trim_process_node(node)
+ if should_hoist?(node)
+ hoisted = trim_hoist_element node
+ hoisted.each { |child| trim_process_node child }
+ elsif should_delete?(node)
+ node.remove
+ else
+ if children = node.children
+ children.each { |child| trim_process_node child }
+ end
+ end
+
+ node
+ end
+
+ def trim_hoist_element(element)
+ hoisted = []
+ element.children.each do |child|
+ element.before(child)
+ hoisted << child
+ end
+ element.remove
+ hoisted
+ end
+
+ def should_hoist?(node)
+ return false unless node.element?
+ HTML_HOIST_ELEMENTS.include? node.name
+ end
+
+ def should_delete?(node)
+ return true if HTML_DELETE_ELEMENT_TYPES.include? node.type
+ return true if node.element? && node.name == 'head'
+ return true if node.text? && node.text.strip.blank?
+
+ false
+ end
+ end
+ end
+end
diff --git a/lib/gitlab/email/reply_parser.rb b/lib/gitlab/email/reply_parser.rb
index 3411eb1d9ce..7028f0ac8e2 100644
--- a/lib/gitlab/email/reply_parser.rb
+++ b/lib/gitlab/email/reply_parser.rb
@@ -1,78 +1,67 @@
# Inspired in great part by Discourse's Email::Receiver
+# https://github.com/discourse/discourse/blob/e92f5e4fbf04a88d37dc5069917090abf6c07dec/lib/email/receiver.rb
module Gitlab
module Email
class ReplyParser
- attr_accessor :message
+ attr_accessor :mail
- def initialize(message)
- @message = message
+ def initialize(mail)
+ @mail = mail
end
def execute
- body = select_body(message)
-
- encoding = body.encoding
-
- body = discourse_email_trimmer(body)
-
- body = EmailReplyParser.parse_reply(body)
+ text = nil
+ html = nil
+
+ if mail.multipart?
+ text = fix_charset(mail.text_part)
+ html = fix_charset(mail.html_part)
+ elsif mail.content_type.to_s['text/html']
+ html = fix_charset(mail)
+ else
+ text = fix_charset(mail)
+ end
- body.force_encoding(encoding).encode("UTF-8")
+ if html.present?
+ cleaned_html = Email::HtmlCleaner.new(html).output_html
+ EmailReplyTrimmer.trim(cleaned_html)
+ elsif text.present?
+ EmailReplyTrimmer.trim(text)
+ else
+ ''
+ end
end
private
- def select_body(message)
- text = message.text_part if message.multipart?
- text ||= message if message.content_type !~ /text\/html/
-
- return "" unless text
+ # copied from https://github.com/discourse/discourse/blob/e92f5e4fbf04a88d37dc5069917090abf6c07dec/lib/email/receiver.rb
+ def fix_charset(mail_part)
+ return nil if mail_part.blank? || mail_part.body.blank?
- text = fix_charset(text)
-
- # Certain trigger phrases that means we didn't parse correctly
- if text =~ /(Content\-Type\:|multipart\/alternative|text\/plain)/
- return ""
- end
+ string = mail_part.body.decoded rescue nil
- text
- end
+ return nil if string.blank?
- # Force encoding to UTF-8 on a Mail::Message or Mail::Part
- def fix_charset(object)
- return nil if object.nil?
+ # common encodings
+ encodings = ["UTF-8", "ISO-8859-1"]
+ encodings.unshift(mail_part.charset) if mail_part.charset.present?
- if object.charset
- object.body.decoded.force_encoding(object.charset.gsub(/utf8/i, "UTF-8")).encode("UTF-8").to_s
- else
- object.body.to_s
+ encodings.uniq.each do |encoding|
+ fixed = try_to_encode(string, encoding)
+ return fixed if fixed.present?
end
- rescue
+
nil
end
- REPLYING_HEADER_LABELS = %w(From Sent To Subject Reply To Cc Bcc Date)
- REPLYING_HEADER_REGEX = Regexp.union(REPLYING_HEADER_LABELS.map { |label| "#{label}:" })
-
- def discourse_email_trimmer(body)
- lines = body.scrub.lines.to_a
- range_end = 0
-
- lines.each_with_index do |l, idx|
- # This one might be controversial but so many reply lines have years, times and end with a colon.
- # Let's try it and see how well it works.
- break if (l =~ /\d{4}/ && l =~ /\d:\d\d/ && l =~ /\:$/) ||
- (l =~ /On \w+ \d+,? \d+,?.*wrote:/)
-
- # Headers on subsequent lines
- break if (0..2).all? { |off| lines[idx + off] =~ REPLYING_HEADER_REGEX }
- # Headers on the same line
- break if REPLYING_HEADER_LABELS.count { |label| l.include?(label) } >= 3
-
- range_end = idx
- end
-
- lines[0..range_end].join.strip
+ # copied from https://github.com/discourse/discourse/blob/e92f5e4fbf04a88d37dc5069917090abf6c07dec/lib/email/receiver.rb
+ def try_to_encode(string, encoding)
+ encoded = string.encode("UTF-8", encoding)
+ encoded.present? && encoded.valid_encoding? ? encoded : nil
+ rescue Encoding::InvalidByteSequenceError,
+ Encoding::UndefinedConversionError,
+ Encoding::ConverterNotFoundError
+ nil
end
end
end