diff options
author | Lin Jen-Shin <godfat@godfat.org> | 2016-08-05 10:56:33 +0000 |
---|---|---|
committer | Lin Jen-Shin <godfat@godfat.org> | 2016-08-05 10:56:33 +0000 |
commit | afd00a4b640ba43231258da0368abd92677163f5 (patch) | |
tree | ed356011287bf6f6ed71379feadff1242cf79fa5 | |
parent | 7b4279984ca9b517f6089931e0fa8c152e195b61 (diff) | |
download | gitlab-ce-adopt-email_reply_trimmer.tar.gz |
Try to adopt what Discourse is using right now:adopt-email_reply_trimmer
https://gitlab.com/gitlab-org/gitlab-ce/issues/20514#note_13560164
See also:
Issue #3020, #14805, #20514
-rw-r--r-- | Gemfile | 2 | ||||
-rw-r--r-- | Gemfile.lock | 4 | ||||
-rw-r--r-- | lib/gitlab/email/html_cleaner.rb | 135 | ||||
-rw-r--r-- | lib/gitlab/email/reply_parser.rb | 97 |
4 files changed, 181 insertions, 57 deletions
@@ -332,7 +332,7 @@ gem 'octokit', '~> 4.3.0' gem 'mail_room', '~> 0.8' -gem 'email_reply_parser', '~> 0.5.8' +gem 'email_reply_trimmer', '~> 0.1.3' gem 'ruby-prof', '~> 0.15.9' diff --git a/Gemfile.lock b/Gemfile.lock index 460a6c409e7..0da153addb7 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -179,7 +179,7 @@ GEM railties (>= 4.2) dropzonejs-rails (0.7.2) rails (> 3.1) - email_reply_parser (0.5.8) + email_reply_trimmer (0.1.3) email_spec (1.6.0) launchy (~> 2.1) mail (~> 2.2) @@ -849,7 +849,7 @@ DEPENDENCIES diffy (~> 3.0.3) doorkeeper (~> 4.0) dropzonejs-rails (~> 0.7.1) - email_reply_parser (~> 0.5.8) + email_reply_trimmer (~> 0.1.3) email_spec (~> 1.6.0) factory_girl_rails (~> 4.6.0) ffaker (~> 2.0.0) diff --git a/lib/gitlab/email/html_cleaner.rb b/lib/gitlab/email/html_cleaner.rb new file mode 100644 index 00000000000..48b1247ec1f --- /dev/null +++ b/lib/gitlab/email/html_cleaner.rb @@ -0,0 +1,135 @@ +# Copied from https://github.com/discourse/discourse/blob/e92f5e4fbf04a88d37dc5069917090abf6c07dec/lib/email/html_cleaner.rb +module Gitlab + module Email + # HtmlCleaner cleans up the extremely dirty HTML that many email clients + # generate by stripping out any excess divs or spans, removing styling in + # the process (which also makes the html more suitable to be parsed as + # Markdown). + class HtmlCleaner + # Elements to hoist all children out of + HTML_HOIST_ELEMENTS = %w(div span font table tbody th tr td) + # Node types to always delete + HTML_DELETE_ELEMENT_TYPES = [ + Nokogiri::XML::Node::DTD_NODE, + Nokogiri::XML::Node::COMMENT_NODE, + ] + + # Private variables: + # @doc - nokogiri document + # @out - same as @doc, but only if trimming has occured + def initialize(html) + if String === html + @doc = Nokogiri::HTML(html) + else + @doc = html + end + end + + class << self + # Email::HtmlCleaner.trim(inp, opts={}) + # + # Arguments: + # inp - Either a HTML string or a Nokogiri document. + # Options: + # :return => :doc, :string + # Specify the desired return type. + # Defaults to the type of the input. + # A value of :string is equivalent to calling get_document_text() + # on the returned document. + def trim(inp, opts={}) + cleaner = HtmlCleaner.new(inp) + + opts[:return] ||= ((String === inp) ? :string : :doc) + + if opts[:return] == :string + cleaner.output_html + else + cleaner.output_document + end + end + + # Email::HtmlCleaner.get_document_text(doc) + # + # Get the body portion of the document, including html, as a string. + def get_document_text(doc) + body = doc.xpath('//body') + if body + body.inner_html + else + doc.inner_html + end + end + end + + def output_document + @out ||= begin + doc = @doc + trim_process_node doc + add_newlines doc + doc + end + end + + def output_html + HtmlCleaner.get_document_text(output_document) + end + + private + + def add_newlines(doc) + # Replace <br> tags with a markdown \n + doc.xpath('//br').each do |br| + br.replace(new_linebreak_node doc, 2) + end + # Surround <p> tags with newlines, to help with line-wise postprocessing + # and ensure markdown paragraphs + doc.xpath('//p').each do |p| + p.before(new_linebreak_node doc) + p.after(new_linebreak_node doc, 2) + end + end + + def new_linebreak_node(doc, count=1) + Nokogiri::XML::Text.new("\n" * count, doc) + end + + def trim_process_node(node) + if should_hoist?(node) + hoisted = trim_hoist_element node + hoisted.each { |child| trim_process_node child } + elsif should_delete?(node) + node.remove + else + if children = node.children + children.each { |child| trim_process_node child } + end + end + + node + end + + def trim_hoist_element(element) + hoisted = [] + element.children.each do |child| + element.before(child) + hoisted << child + end + element.remove + hoisted + end + + def should_hoist?(node) + return false unless node.element? + HTML_HOIST_ELEMENTS.include? node.name + end + + def should_delete?(node) + return true if HTML_DELETE_ELEMENT_TYPES.include? node.type + return true if node.element? && node.name == 'head' + return true if node.text? && node.text.strip.blank? + + false + end + end + end +end diff --git a/lib/gitlab/email/reply_parser.rb b/lib/gitlab/email/reply_parser.rb index 3411eb1d9ce..7028f0ac8e2 100644 --- a/lib/gitlab/email/reply_parser.rb +++ b/lib/gitlab/email/reply_parser.rb @@ -1,78 +1,67 @@ # Inspired in great part by Discourse's Email::Receiver +# https://github.com/discourse/discourse/blob/e92f5e4fbf04a88d37dc5069917090abf6c07dec/lib/email/receiver.rb module Gitlab module Email class ReplyParser - attr_accessor :message + attr_accessor :mail - def initialize(message) - @message = message + def initialize(mail) + @mail = mail end def execute - body = select_body(message) - - encoding = body.encoding - - body = discourse_email_trimmer(body) - - body = EmailReplyParser.parse_reply(body) + text = nil + html = nil + + if mail.multipart? + text = fix_charset(mail.text_part) + html = fix_charset(mail.html_part) + elsif mail.content_type.to_s['text/html'] + html = fix_charset(mail) + else + text = fix_charset(mail) + end - body.force_encoding(encoding).encode("UTF-8") + if html.present? + cleaned_html = Email::HtmlCleaner.new(html).output_html + EmailReplyTrimmer.trim(cleaned_html) + elsif text.present? + EmailReplyTrimmer.trim(text) + else + '' + end end private - def select_body(message) - text = message.text_part if message.multipart? - text ||= message if message.content_type !~ /text\/html/ - - return "" unless text + # copied from https://github.com/discourse/discourse/blob/e92f5e4fbf04a88d37dc5069917090abf6c07dec/lib/email/receiver.rb + def fix_charset(mail_part) + return nil if mail_part.blank? || mail_part.body.blank? - text = fix_charset(text) - - # Certain trigger phrases that means we didn't parse correctly - if text =~ /(Content\-Type\:|multipart\/alternative|text\/plain)/ - return "" - end + string = mail_part.body.decoded rescue nil - text - end + return nil if string.blank? - # Force encoding to UTF-8 on a Mail::Message or Mail::Part - def fix_charset(object) - return nil if object.nil? + # common encodings + encodings = ["UTF-8", "ISO-8859-1"] + encodings.unshift(mail_part.charset) if mail_part.charset.present? - if object.charset - object.body.decoded.force_encoding(object.charset.gsub(/utf8/i, "UTF-8")).encode("UTF-8").to_s - else - object.body.to_s + encodings.uniq.each do |encoding| + fixed = try_to_encode(string, encoding) + return fixed if fixed.present? end - rescue + nil end - REPLYING_HEADER_LABELS = %w(From Sent To Subject Reply To Cc Bcc Date) - REPLYING_HEADER_REGEX = Regexp.union(REPLYING_HEADER_LABELS.map { |label| "#{label}:" }) - - def discourse_email_trimmer(body) - lines = body.scrub.lines.to_a - range_end = 0 - - lines.each_with_index do |l, idx| - # This one might be controversial but so many reply lines have years, times and end with a colon. - # Let's try it and see how well it works. - break if (l =~ /\d{4}/ && l =~ /\d:\d\d/ && l =~ /\:$/) || - (l =~ /On \w+ \d+,? \d+,?.*wrote:/) - - # Headers on subsequent lines - break if (0..2).all? { |off| lines[idx + off] =~ REPLYING_HEADER_REGEX } - # Headers on the same line - break if REPLYING_HEADER_LABELS.count { |label| l.include?(label) } >= 3 - - range_end = idx - end - - lines[0..range_end].join.strip + # copied from https://github.com/discourse/discourse/blob/e92f5e4fbf04a88d37dc5069917090abf6c07dec/lib/email/receiver.rb + def try_to_encode(string, encoding) + encoded = string.encode("UTF-8", encoding) + encoded.present? && encoded.valid_encoding? ? encoded : nil + rescue Encoding::InvalidByteSequenceError, + Encoding::UndefinedConversionError, + Encoding::ConverterNotFoundError + nil end end end |