diff options
author | Douwe Maan <douwe@gitlab.com> | 2015-08-20 11:05:06 -0700 |
---|---|---|
committer | Douwe Maan <douwe@gitlab.com> | 2015-08-20 11:05:06 -0700 |
commit | e9972efc2f3d730e989907585dd1438c517a0bba (patch) | |
tree | 7a38f9638cc50813d16d55f9276db98dd7cb041c /lib | |
parent | 3ff9d5c64cef8bf8daed5e253e388545987fb945 (diff) | |
download | gitlab-ce-e9972efc2f3d730e989907585dd1438c517a0bba.tar.gz |
Extract ReplyParser and AttachmentUploader from Receiver.
Diffstat (limited to 'lib')
-rw-r--r-- | lib/gitlab/email/attachment_uploader.rb | 35 | ||||
-rw-r--r-- | lib/gitlab/email/html_cleaner.rb | 135 | ||||
-rw-r--r-- | lib/gitlab/email/receiver.rb | 101 | ||||
-rw-r--r-- | lib/gitlab/email/reply_parser.rb | 91 | ||||
-rw-r--r-- | lib/gitlab/email_html_cleaner.rb | 133 | ||||
-rw-r--r-- | lib/gitlab/email_receiver.rb | 192 |
6 files changed, 362 insertions, 325 deletions
diff --git a/lib/gitlab/email/attachment_uploader.rb b/lib/gitlab/email/attachment_uploader.rb new file mode 100644 index 00000000000..0c0f50f2751 --- /dev/null +++ b/lib/gitlab/email/attachment_uploader.rb @@ -0,0 +1,35 @@ +module Gitlab + module Email + module AttachmentUploader + attr_accessor :message + + def initialize(message) + @message = message + end + + def execute(project) + attachments = [] + + message.attachments.each do |attachment| + tmp = Tempfile.new("gitlab-email-attachment") + begin + File.open(tmp.path, "w+b") { |f| f.write attachment.body.decoded } + + file = { + tempfile: tmp, + filename: attachment.filename, + content_type: attachment.content_type + } + + link = ::Projects::UploadService.new(project, file).execute + attachments << link if link + ensure + tmp.close! + end + end + + attachments + end + end + end +end diff --git a/lib/gitlab/email/html_cleaner.rb b/lib/gitlab/email/html_cleaner.rb new file mode 100644 index 00000000000..e1ae9eee56c --- /dev/null +++ b/lib/gitlab/email/html_cleaner.rb @@ -0,0 +1,135 @@ +# Taken mostly from Discourse's Email::HtmlCleaner +module Gitlab + module Email + # HtmlCleaner cleans up the extremely dirty HTML that many email clients + # generate by stripping out any excess divs or spans, removing styling in + # the process (which also makes the html more suitable to be parsed as + # Markdown). + class HtmlCleaner + # Elements to hoist all children out of + HTML_HOIST_ELEMENTS = %w(div span font table tbody th tr td) + # Node types to always delete + HTML_DELETE_ELEMENT_TYPES = [ + Nokogiri::XML::Node::DTD_NODE, + Nokogiri::XML::Node::COMMENT_NODE, + ] + + # Private variables: + # @doc - nokogiri document + # @out - same as @doc, but only if trimming has occured + def initialize(html) + if html.is_a?(String) + @doc = Nokogiri::HTML(html) + else + @doc = html + end + end + + class << self + # HtmlCleaner.trim(inp, opts={}) + # + # Arguments: + # inp - Either a HTML string or a Nokogiri document. + # Options: + # :return => :doc, :string + # Specify the desired return type. + # Defaults to the type of the input. + # A value of :string is equivalent to calling get_document_text() + # on the returned document. + def trim(inp, opts={}) + cleaner = HtmlCleaner.new(inp) + + opts[:return] ||= (inp.is_a?(String) ? :string : :doc) + + if opts[:return] == :string + cleaner.output_html + else + cleaner.output_document + end + end + + # HtmlCleaner.get_document_text(doc) + # + # Get the body portion of the document, including html, as a string. + def get_document_text(doc) + body = doc.xpath('//body') + if body + body.inner_html + else + doc.inner_html + end + end + end + + def output_document + @out ||= begin + doc = @doc + trim_process_node doc + add_newlines doc + doc + end + end + + def output_html + HtmlCleaner.get_document_text(output_document) + end + + private + + def add_newlines(doc) + # Replace <br> tags with a markdown \n + doc.xpath('//br').each do |br| + br.replace(new_linebreak_node doc, 2) + end + # Surround <p> tags with newlines, to help with line-wise postprocessing + # and ensure markdown paragraphs + doc.xpath('//p').each do |p| + p.before(new_linebreak_node doc) + p.after(new_linebreak_node doc, 2) + end + end + + def new_linebreak_node(doc, count=1) + Nokogiri::XML::Text.new("\n" * count, doc) + end + + def trim_process_node(node) + if should_hoist?(node) + hoisted = trim_hoist_element node + hoisted.each { |child| trim_process_node child } + elsif should_delete?(node) + node.remove + else + if children = node.children + children.each { |child| trim_process_node child } + end + end + + node + end + + def trim_hoist_element(element) + hoisted = [] + element.children.each do |child| + element.before(child) + hoisted << child + end + element.remove + hoisted + end + + def should_hoist?(node) + return false unless node.element? + HTML_HOIST_ELEMENTS.include? node.name + end + + def should_delete?(node) + return true if HTML_DELETE_ELEMENT_TYPES.include? node.type + return true if node.element? && node.name == 'head' + return true if node.text? && node.text.strip.blank? + + false + end + end + end +end diff --git a/lib/gitlab/email/receiver.rb b/lib/gitlab/email/receiver.rb new file mode 100644 index 00000000000..c46fce6afe2 --- /dev/null +++ b/lib/gitlab/email/receiver.rb @@ -0,0 +1,101 @@ +# Inspired in great part by Discourse's Email::Receiver +module Gitlab + module Email + class Receiver + class ProcessingError < StandardError; end + class EmailUnparsableError < ProcessingError; end + class EmptyEmailError < ProcessingError; end + class UserNotFoundError < ProcessingError; end + class UserNotAuthorizedError < ProcessingError; end + class NoteableNotFoundError < ProcessingError; end + class AutoGeneratedEmailError < ProcessingError; end + class SentNotificationNotFound < ProcessingError; end + class InvalidNote < ProcessingError; end + + def initialize(raw) + @raw = raw + end + + def message + @message ||= Mail::Message.new(@raw) + rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError => e + raise EmailUnparsableError, e + end + + def execute + raise SentNotificationNotFound unless sent_notification + + raise EmptyEmailError if @raw.blank? + + raise AutoGeneratedEmailError if message.header.to_s =~ /auto-(generated|replied)/ + + author = sent_notification.recipient + + raise UserNotFoundError unless author + + project = sent_notification.project + + raise UserNotAuthorizedError unless author.can?(:create_note, project) + + raise NoteableNotFoundError unless sent_notification.noteable + + reply = ReplyParser.new(message).execute.strip + + raise EmptyEmailError if reply.blank? + + reply = add_attachments(reply) + + note = create_note(reply) + + unless note.persisted? + message = "The comment could not be created for the following reasons:" + note.errors.full_messages.each do |error| + message << "\n\n- #{error}" + end + + raise InvalidNote, message + end + end + + private + + def reply_key + reply_key = nil + message.to.each do |address| + reply_key = Gitlab::ReplyByEmail.reply_key_from_address(address) + break if reply_key + end + + reply_key + end + + def sent_notification + return nil unless reply_key + + SentNotification.for(reply_key) + end + + def add_attachments(reply) + attachments = AttachmentUploader.new(message).execute(project) + + attachments.each do |link| + text = "[#{link[:alt]}](#{link[:url]})" + text.prepend("!") if link[:is_image] + + reply << "\n\n#{text}" + end + end + + def create_note(reply) + Notes::CreateService.new( + sent_notification.project, + sent_notification.recipient, + note: reply, + noteable_type: sent_notification.noteable_type, + noteable_id: sent_notification.noteable_id, + commit_id: sent_notification.commit_id + ).execute + end + end + end +end diff --git a/lib/gitlab/email/reply_parser.rb b/lib/gitlab/email/reply_parser.rb new file mode 100644 index 00000000000..6ceb755968c --- /dev/null +++ b/lib/gitlab/email/reply_parser.rb @@ -0,0 +1,91 @@ +# Inspired in great part by Discourse's Email::Receiver +module Gitlab + module Email + class ReplyParser + attr_accessor :message + + def initialize(message) + @message = message + end + + def execute + body = select_body(message) + + encoding = body.encoding + + body = discourse_email_trimmer(body) + + body = EmailReplyParser.parse_reply(body) + + body.force_encoding(encoding).encode("UTF-8") + end + + private + + def select_body(message) + html = nil + text = nil + + if message.multipart? + html = fix_charset(message.html_part) + text = fix_charset(message.text_part) + elsif message.content_type =~ /text\/html/ + html = fix_charset(message) + end + + # prefer plain text + return text if text + + if html + body = HtmlCleaner.new(html).output_html + else + body = fix_charset(message) + end + + # Certain trigger phrases that means we didn't parse correctly + if body =~ /(Content\-Type\:|multipart\/alternative|text\/plain)/ + return "" + end + + body + end + + # Force encoding to UTF-8 on a Mail::Message or Mail::Part + def fix_charset(object) + return nil if object.nil? + + if object.charset + object.body.decoded.force_encoding(object.charset.gsub(/utf8/i, "UTF-8")).encode("UTF-8").to_s + else + object.body.to_s + end + rescue + nil + end + + REPLYING_HEADER_LABELS = %w(From Sent To Subject Reply To Cc Bcc Date) + REPLYING_HEADER_REGEX = Regexp.union(REPLYING_HEADER_LABELS.map { |label| "#{label}:" }) + + def discourse_email_trimmer(body) + lines = body.scrub.lines.to_a + range_end = 0 + + lines.each_with_index do |l, idx| + # This one might be controversial but so many reply lines have years, times and end with a colon. + # Let's try it and see how well it works. + break if (l =~ /\d{4}/ && l =~ /\d:\d\d/ && l =~ /\:$/) || + (l =~ /On \w+ \d+,? \d+,?.*wrote:/) + + # Headers on subsequent lines + break if (0..2).all? { |off| lines[idx+off] =~ REPLYING_HEADER_REGEX } + # Headers on the same line + break if REPLYING_HEADER_LABELS.count { |label| l.include?(label) } >= 3 + + range_end = idx + end + + lines[0..range_end].join.strip + end + end + end +end diff --git a/lib/gitlab/email_html_cleaner.rb b/lib/gitlab/email_html_cleaner.rb deleted file mode 100644 index 6d7a17fe87c..00000000000 --- a/lib/gitlab/email_html_cleaner.rb +++ /dev/null @@ -1,133 +0,0 @@ -# Taken mostly from Discourse's Email::HtmlCleaner -module Gitlab - # HtmlCleaner cleans up the extremely dirty HTML that many email clients - # generate by stripping out any excess divs or spans, removing styling in - # the process (which also makes the html more suitable to be parsed as - # Markdown). - class EmailHtmlCleaner - # Elements to hoist all children out of - HTML_HOIST_ELEMENTS = %w(div span font table tbody th tr td) - # Node types to always delete - HTML_DELETE_ELEMENT_TYPES = [ - Nokogiri::XML::Node::DTD_NODE, - Nokogiri::XML::Node::COMMENT_NODE, - ] - - # Private variables: - # @doc - nokogiri document - # @out - same as @doc, but only if trimming has occured - def initialize(html) - if html.is_a?(String) - @doc = Nokogiri::HTML(html) - else - @doc = html - end - end - - class << self - # EmailHtmlCleaner.trim(inp, opts={}) - # - # Arguments: - # inp - Either a HTML string or a Nokogiri document. - # Options: - # :return => :doc, :string - # Specify the desired return type. - # Defaults to the type of the input. - # A value of :string is equivalent to calling get_document_text() - # on the returned document. - def trim(inp, opts={}) - cleaner = EmailHtmlCleaner.new(inp) - - opts[:return] ||= (inp.is_a?(String) ? :string : :doc) - - if opts[:return] == :string - cleaner.output_html - else - cleaner.output_document - end - end - - # EmailHtmlCleaner.get_document_text(doc) - # - # Get the body portion of the document, including html, as a string. - def get_document_text(doc) - body = doc.xpath('//body') - if body - body.inner_html - else - doc.inner_html - end - end - end - - def output_document - @out ||= begin - doc = @doc - trim_process_node doc - add_newlines doc - doc - end - end - - def output_html - EmailHtmlCleaner.get_document_text(output_document) - end - - private - - def add_newlines(doc) - # Replace <br> tags with a markdown \n - doc.xpath('//br').each do |br| - br.replace(new_linebreak_node doc, 2) - end - # Surround <p> tags with newlines, to help with line-wise postprocessing - # and ensure markdown paragraphs - doc.xpath('//p').each do |p| - p.before(new_linebreak_node doc) - p.after(new_linebreak_node doc, 2) - end - end - - def new_linebreak_node(doc, count=1) - Nokogiri::XML::Text.new("\n" * count, doc) - end - - def trim_process_node(node) - if should_hoist?(node) - hoisted = trim_hoist_element node - hoisted.each { |child| trim_process_node child } - elsif should_delete?(node) - node.remove - else - if children = node.children - children.each { |child| trim_process_node child } - end - end - - node - end - - def trim_hoist_element(element) - hoisted = [] - element.children.each do |child| - element.before(child) - hoisted << child - end - element.remove - hoisted - end - - def should_hoist?(node) - return false unless node.element? - HTML_HOIST_ELEMENTS.include? node.name - end - - def should_delete?(node) - return true if HTML_DELETE_ELEMENT_TYPES.include? node.type - return true if node.element? && node.name == 'head' - return true if node.text? && node.text.strip.blank? - - false - end - end -end diff --git a/lib/gitlab/email_receiver.rb b/lib/gitlab/email_receiver.rb deleted file mode 100644 index 3c1f346c0cf..00000000000 --- a/lib/gitlab/email_receiver.rb +++ /dev/null @@ -1,192 +0,0 @@ -# Inspired in great part by Discourse's Email::Receiver -module Gitlab - class EmailReceiver - class ProcessingError < StandardError; end - class EmailUnparsableError < ProcessingError; end - class EmptyEmailError < ProcessingError; end - class UserNotFoundError < ProcessingError; end - class UserNotAuthorizedError < ProcessingError; end - class NoteableNotFoundError < ProcessingError; end - class AutoGeneratedEmailError < ProcessingError; end - class SentNotificationNotFound < ProcessingError; end - class InvalidNote < ProcessingError; end - - def initialize(raw) - @raw = raw - end - - def message - @message ||= Mail::Message.new(@raw) - rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError => e - raise EmailUnparsableError, e - end - - def execute - raise SentNotificationNotFound unless sent_notification - - raise EmptyEmailError if @raw.blank? - - raise AutoGeneratedEmailError if message.header.to_s =~ /auto-(generated|replied)/ - - author = sent_notification.recipient - - raise UserNotFoundError unless author - - project = sent_notification.project - - raise UserNotAuthorizedError unless author.can?(:create_note, project) - - raise NoteableNotFoundError unless sent_notification.noteable - - body = parse_body(message) - - upload_attachments.each do |link| - body << "\n\n#{link}" - end - - note = Notes::CreateService.new( - project, - author, - note: body, - noteable_type: sent_notification.noteable_type, - noteable_id: sent_notification.noteable_id, - commit_id: sent_notification.commit_id - ).execute - - unless note.persisted? - message = "The comment could not be created for the following reasons:" - note.errors.full_messages.each do |error| - message << "\n\n- #{error}" - end - raise InvalidNote, message - end - end - - def parse_body(message) - body = select_body(message) - - encoding = body.encoding - raise EmptyEmailError if body.strip.blank? - - body = discourse_email_trimmer(body) - raise EmptyEmailError if body.strip.blank? - - body = EmailReplyParser.parse_reply(body) - raise EmptyEmailError if body.strip.blank? - - body.force_encoding(encoding).encode("UTF-8") - end - - private - - def reply_key - reply_key = nil - message.to.each do |address| - reply_key = Gitlab::ReplyByEmail.reply_key_from_address(address) - break if reply_key - end - - reply_key - end - - def sent_notification - return nil unless reply_key - - SentNotification.for(reply_key) - end - - def select_body(message) - html = nil - text = nil - - if message.multipart? - html = fix_charset(message.html_part) - text = fix_charset(message.text_part) - elsif message.content_type =~ /text\/html/ - html = fix_charset(message) - end - - # prefer plain text - return text if text - - if html - body = EmailHtmlCleaner.new(html).output_html - else - body = fix_charset(message) - end - - # Certain trigger phrases that means we didn't parse correctly - if body =~ /(Content\-Type\:|multipart\/alternative|text\/plain)/ - raise EmptyEmailError - end - - body - end - - # Force encoding to UTF-8 on a Mail::Message or Mail::Part - def fix_charset(object) - return nil if object.nil? - - if object.charset - object.body.decoded.force_encoding(object.charset.gsub(/utf8/i, "UTF-8")).encode("UTF-8").to_s - else - object.body.to_s - end - rescue - nil - end - - REPLYING_HEADER_LABELS = %w(From Sent To Subject Reply To Cc Bcc Date) - REPLYING_HEADER_REGEX = Regexp.union(REPLYING_HEADER_LABELS.map { |label| "#{label}:" }) - - def discourse_email_trimmer(body) - lines = body.scrub.lines.to_a - range_end = 0 - - lines.each_with_index do |l, idx| - # This one might be controversial but so many reply lines have years, times and end with a colon. - # Let's try it and see how well it works. - break if (l =~ /\d{4}/ && l =~ /\d:\d\d/ && l =~ /\:$/) || - (l =~ /On \w+ \d+,? \d+,?.*wrote:/) - - # Headers on subsequent lines - break if (0..2).all? { |off| lines[idx+off] =~ REPLYING_HEADER_REGEX } - # Headers on the same line - break if REPLYING_HEADER_LABELS.count { |label| l.include?(label) } >= 3 - - range_end = idx - end - - lines[0..range_end].join.strip - end - - def upload_attachments - attachments = [] - - message.attachments.each do |attachment| - tmp = Tempfile.new("gitlab-email-attachment") - begin - File.open(tmp.path, "w+b") { |f| f.write attachment.body.decoded } - - file = { - tempfile: tmp, - filename: attachment.filename, - content_type: attachment.content_type - } - - link = ::Projects::UploadService.new(sent_notification.project, file).execute - if link - text = "[#{link[:alt]}](#{link[:url]})" - text.prepend("!") if link[:is_image] - - attachments << text - end - ensure - tmp.close! - end - end - - attachments - end - end -end |