diff options
author | Douwe Maan <douwe@gitlab.com> | 2016-12-27 14:46:47 +0000 |
---|---|---|
committer | Douwe Maan <douwe@gitlab.com> | 2016-12-27 14:46:47 +0000 |
commit | d6a37a8cef7318793b16897e0f56f71f58c4679f (patch) | |
tree | b0d3ac0051ad58a957086461c844468cfeaaa21e | |
parent | 47b35ddecd6350c422ffc20d762ed6af232c577f (diff) | |
parent | 7218daaa96badbaadf6e94b6729a74a15273935d (diff) | |
download | gitlab-ce-d6a37a8cef7318793b16897e0f56f71f58c4679f.tar.gz |
Merge branch 'bugfix/outlook-language-parsing' into 'master'
Switch to Discourse's EmailReplyTrimmer for more robust reply parsing
See merge request !7473
-rw-r--r-- | Gemfile | 2 | ||||
-rw-r--r-- | Gemfile.lock | 4 | ||||
-rw-r--r-- | lib/gitlab/email/reply_parser.rb | 36 | ||||
-rw-r--r-- | spec/lib/gitlab/email/reply_parser_spec.rb | 2 |
4 files changed, 13 insertions, 31 deletions
@@ -332,7 +332,7 @@ gem 'octokit', '~> 4.3.0' gem 'mail_room', '~> 0.9.0' -gem 'email_reply_parser', '~> 0.5.8' +gem 'email_reply_trimmer', '~> 0.1' gem 'html2text' gem 'ruby-prof', '~> 0.16.2' diff --git a/Gemfile.lock b/Gemfile.lock index 9f8367b420a..765d57c6238 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -167,7 +167,7 @@ GEM railties (>= 4.2) dropzonejs-rails (0.7.2) rails (> 3.1) - email_reply_parser (0.5.8) + email_reply_trimmer (0.1.6) email_spec (1.6.0) launchy (~> 2.1) mail (~> 2.2) @@ -839,7 +839,7 @@ DEPENDENCIES diffy (~> 3.1.0) doorkeeper (~> 4.2.0) dropzonejs-rails (~> 0.7.1) - email_reply_parser (~> 0.5.8) + email_reply_trimmer (~> 0.1) email_spec (~> 1.6.0) factory_girl_rails (~> 4.7.0) ffaker (~> 2.0.0) diff --git a/lib/gitlab/email/reply_parser.rb b/lib/gitlab/email/reply_parser.rb index f586c5ab062..8c8dd1b9cef 100644 --- a/lib/gitlab/email/reply_parser.rb +++ b/lib/gitlab/email/reply_parser.rb @@ -13,9 +13,17 @@ module Gitlab encoding = body.encoding - body = discourse_email_trimmer(body) + body = EmailReplyTrimmer.trim(body) - body = EmailReplyParser.parse_reply(body) + return '' unless body + + # not using /\s+$/ here because that deletes empty lines + body = body.gsub(/[ \t]$/, '') + + # NOTE: We currently don't support empty quotes. + # EmailReplyTrimmer allows this as a special case, + # so we detect it manually here. + return "" if body.lines.all? { |l| l.strip.empty? || l.start_with?('>') } body.force_encoding(encoding).encode("UTF-8") end @@ -57,30 +65,6 @@ module Gitlab rescue nil end - - REPLYING_HEADER_LABELS = %w(From Sent To Subject Reply To Cc Bcc Date) - REPLYING_HEADER_REGEX = Regexp.union(REPLYING_HEADER_LABELS.map { |label| "#{label}:" }) - - def discourse_email_trimmer(body) - lines = body.scrub.lines.to_a - range_end = 0 - - lines.each_with_index do |l, idx| - # This one might be controversial but so many reply lines have years, times and end with a colon. - # Let's try it and see how well it works. - break if (l =~ /\d{4}/ && l =~ /\d:\d\d/ && l =~ /\:$/) || - (l =~ /On \w+ \d+,? \d+,?.*wrote:/) - - # Headers on subsequent lines - break if (0..2).all? { |off| lines[idx + off] =~ REPLYING_HEADER_REGEX } - # Headers on the same line - break if REPLYING_HEADER_LABELS.count { |label| l.include?(label) } >= 3 - - range_end = idx - end - - lines[0..range_end].join.strip - end end end end diff --git a/spec/lib/gitlab/email/reply_parser_spec.rb b/spec/lib/gitlab/email/reply_parser_spec.rb index c7a0139d32a..28698e89c33 100644 --- a/spec/lib/gitlab/email/reply_parser_spec.rb +++ b/spec/lib/gitlab/email/reply_parser_spec.rb @@ -88,8 +88,6 @@ describe Gitlab::Email::ReplyParser, lib: true do expect(test_parse_body(fixture_file("emails/inline_reply.eml"))). to eq( <<-BODY.strip_heredoc.chomp - On Wed, Oct 8, 2014 at 11:12 AM, techAPJ <info@unconfigured.discourse.org> wrote: - > techAPJ <https://meta.discourse.org/users/techapj> > November 28 > |