diff options
author | Douwe Maan <douwe@gitlab.com> | 2018-03-05 15:39:36 +0000 |
---|---|---|
committer | Douwe Maan <douwe@gitlab.com> | 2018-03-05 15:39:36 +0000 |
commit | 089b2fb9fc4aace5dca759eec87c980c780192d5 (patch) | |
tree | 5659343c54fced2149aa252b75c62b82e6f16641 | |
parent | fa1debefe7feb7980863aaa383d5a02e488b9a94 (diff) | |
parent | cb55bc3c0770adf7122d7cf49b12cb45c43de7ec (diff) | |
download | gitlab-ce-089b2fb9fc4aace5dca759eec87c980c780192d5.tar.gz |
Merge branch '41719-mr-title-fix' into 'master'
Render htmlentities correctly for links not supported by Rinku
Closes #41719
See merge request gitlab-org/gitlab-ce!17180
-rw-r--r-- | changelogs/unreleased/41719-mr-title-fix.yml | 5 | ||||
-rw-r--r-- | lib/banzai/filter/autolink_filter.rb | 86 | ||||
-rw-r--r-- | lib/gitlab/string_range_marker.rb | 2 | ||||
-rw-r--r-- | lib/gitlab/string_regex_marker.rb | 12 | ||||
-rw-r--r-- | spec/lib/banzai/filter/autolink_filter_spec.rb | 126 | ||||
-rw-r--r-- | spec/lib/gitlab/string_regex_marker_spec.rb | 35 |
6 files changed, 175 insertions, 91 deletions
diff --git a/changelogs/unreleased/41719-mr-title-fix.yml b/changelogs/unreleased/41719-mr-title-fix.yml new file mode 100644 index 00000000000..92388f30cb2 --- /dev/null +++ b/changelogs/unreleased/41719-mr-title-fix.yml @@ -0,0 +1,5 @@ +--- +title: Render htmlentities correctly for links not supported by Rinku +merge_request: +author: +type: fixed diff --git a/lib/banzai/filter/autolink_filter.rb b/lib/banzai/filter/autolink_filter.rb index b8d2673c1a6..75b64ae9af2 100644 --- a/lib/banzai/filter/autolink_filter.rb +++ b/lib/banzai/filter/autolink_filter.rb @@ -25,8 +25,8 @@ module Banzai # period or comma for punctuation without those characters being included # in the generated link. # - # Rubular: http://rubular.com/r/cxjPyZc7Sb - LINK_PATTERN = %r{([a-z][a-z0-9\+\.-]+://\S+)(?<!,|\.)} + # Rubular: http://rubular.com/r/JzPhi6DCZp + LINK_PATTERN = %r{([a-z][a-z0-9\+\.-]+://[^\s>]+)(?<!,|\.)} # Text matching LINK_PATTERN inside these elements will not be linked IGNORE_PARENTS = %w(a code kbd pre script style).to_set @@ -35,53 +35,19 @@ module Banzai TEXT_QUERY = %Q(descendant-or-self::text()[ not(#{IGNORE_PARENTS.map { |p| "ancestor::#{p}" }.join(' or ')}) and contains(., '://') - and not(starts-with(., 'http')) - and not(starts-with(., 'ftp')) ]).freeze + PUNCTUATION_PAIRS = { + "'" => "'", + '"' => '"', + ')' => '(', + ']' => '[', + '}' => '{' + }.freeze + def call return doc if context[:autolink] == false - rinku_parse - text_parse - end - - private - - # Run the text through Rinku as a first pass - # - # This will quickly autolink http(s) and ftp links. - # - # `@doc` will be re-parsed with the HTML String from Rinku. - def rinku_parse - # Convert the options from a Hash to a String that Rinku expects - options = tag_options(link_options) - - # NOTE: We don't parse email links because it will erroneously match - # external Commit and CommitRange references. - # - # The final argument tells Rinku to link short URLs that don't include a - # period (e.g., http://localhost:3000/) - rinku = Rinku.auto_link(html, :urls, options, IGNORE_PARENTS.to_a, 1) - - return if rinku == html - - # Rinku returns a String, so parse it back to a Nokogiri::XML::Document - # for further processing. - @doc = parse_html(rinku) - end - - # Return true if any of the UNSAFE_PROTOCOLS strings are included in the URI scheme - def contains_unsafe?(scheme) - return false unless scheme - - scheme = scheme.strip.downcase - Banzai::Filter::SanitizationFilter::UNSAFE_PROTOCOLS.any? { |protocol| scheme.include?(protocol) } - end - - # Autolinks any text matching LINK_PATTERN that Rinku didn't already - # replace - def text_parse doc.xpath(TEXT_QUERY).each do |node| content = node.to_html @@ -97,6 +63,16 @@ module Banzai doc end + private + + # Return true if any of the UNSAFE_PROTOCOLS strings are included in the URI scheme + def contains_unsafe?(scheme) + return false unless scheme + + scheme = scheme.strip.downcase + Banzai::Filter::SanitizationFilter::UNSAFE_PROTOCOLS.any? { |protocol| scheme.include?(protocol) } + end + def autolink_match(match) # start by stripping out dangerous links begin @@ -112,12 +88,30 @@ module Banzai match.gsub!(/((?:&[\w#]+;)+)\z/, '') dropped = ($1 || '').html_safe + # To match the behaviour of Rinku, if the matched link ends with a + # closing part of a matched pair of punctuation, we remove that trailing + # character unless there are an equal number of closing and opening + # characters in the link. + if match.end_with?(*PUNCTUATION_PAIRS.keys) + close_character = match[-1] + close_count = match.count(close_character) + open_character = PUNCTUATION_PAIRS[close_character] + open_count = match.count(open_character) + + if open_count != close_count || open_character == close_character + dropped += close_character + match = match[0..-2] + end + end + options = link_options.merge(href: match) - content_tag(:a, match, options) + dropped + content_tag(:a, match.html_safe, options) + dropped end def autolink_filter(text) - text.gsub(LINK_PATTERN) { |match| autolink_match(match) } + Gitlab::StringRegexMarker.new(CGI.unescapeHTML(text), text.html_safe).mark(LINK_PATTERN) do |link, left:, right:| + autolink_match(link) + end end def link_options diff --git a/lib/gitlab/string_range_marker.rb b/lib/gitlab/string_range_marker.rb index f9faa134206..c6ad997a4d4 100644 --- a/lib/gitlab/string_range_marker.rb +++ b/lib/gitlab/string_range_marker.rb @@ -14,7 +14,7 @@ module Gitlab end def mark(marker_ranges) - return rich_line unless marker_ranges + return rich_line unless marker_ranges&.any? if html_escaped rich_marker_ranges = [] diff --git a/lib/gitlab/string_regex_marker.rb b/lib/gitlab/string_regex_marker.rb index 7ebf1c0428c..b19aa6dea35 100644 --- a/lib/gitlab/string_regex_marker.rb +++ b/lib/gitlab/string_regex_marker.rb @@ -1,13 +1,15 @@ module Gitlab class StringRegexMarker < StringRangeMarker def mark(regex, group: 0, &block) - regex_match = raw_line.match(regex) - return rich_line unless regex_match + ranges = [] - begin_index, end_index = regex_match.offset(group) - name_range = begin_index..(end_index - 1) + raw_line.scan(regex) do + begin_index, end_index = Regexp.last_match.offset(group) - super([name_range], &block) + ranges << (begin_index..(end_index - 1)) + end + + super(ranges, &block) end end end diff --git a/spec/lib/banzai/filter/autolink_filter_spec.rb b/spec/lib/banzai/filter/autolink_filter_spec.rb index b7c2ff03125..b502daea418 100644 --- a/spec/lib/banzai/filter/autolink_filter_spec.rb +++ b/spec/lib/banzai/filter/autolink_filter_spec.rb @@ -4,6 +4,7 @@ describe Banzai::Filter::AutolinkFilter do include FilterSpecHelper let(:link) { 'http://about.gitlab.com/' } + let(:quotes) { ['"', "'"] } it 'does nothing when :autolink is false' do exp = act = link @@ -15,17 +16,7 @@ describe Banzai::Filter::AutolinkFilter do expect(filter(act).to_html).to eq exp end - context 'when the input contains no links' do - it 'does not parse_html back the rinku returned value' do - act = HTML::Pipeline.parse('<p>This text contains no links to autolink</p>') - - expect_any_instance_of(described_class).not_to receive(:parse_html) - - filter(act).to_html - end - end - - context 'Rinku schemes' do + context 'Various schemes' do it 'autolinks http' do doc = filter("See #{link}") expect(doc.at_css('a').text).to eq link @@ -56,32 +47,26 @@ describe Banzai::Filter::AutolinkFilter do expect(doc.at_css('a')['href']).to eq link end - it 'accepts link_attr options' do - doc = filter("See #{link}", link_attr: { class: 'custom' }) + it 'autolinks multiple URLs' do + link1 = 'http://localhost:3000/' + link2 = 'http://google.com/' - expect(doc.at_css('a')['class']).to eq 'custom' - end + doc = filter("See #{link1} and #{link2}") - described_class::IGNORE_PARENTS.each do |elem| - it "ignores valid links contained inside '#{elem}' element" do - exp = act = "<#{elem}>See #{link}</#{elem}>" - expect(filter(act).to_html).to eq exp - end - end + found_links = doc.css('a') - context 'when the input contains link' do - it 'does parse_html back the rinku returned value' do - act = HTML::Pipeline.parse("<p>See #{link}</p>") + expect(found_links.size).to eq(2) + expect(found_links[0].text).to eq(link1) + expect(found_links[0]['href']).to eq(link1) + expect(found_links[1].text).to eq(link2) + expect(found_links[1]['href']).to eq(link2) + end - expect_any_instance_of(described_class).to receive(:parse_html).at_least(:once).and_call_original + it 'accepts link_attr options' do + doc = filter("See #{link}", link_attr: { class: 'custom' }) - filter(act).to_html - end + expect(doc.at_css('a')['class']).to eq 'custom' end - end - - context 'other schemes' do - let(:link) { 'foo://bar.baz/' } it 'autolinks smb' do link = 'smb:///Volumes/shared/foo.pdf' @@ -91,6 +76,21 @@ describe Banzai::Filter::AutolinkFilter do expect(doc.at_css('a')['href']).to eq link end + it 'autolinks multiple occurences of smb' do + link1 = 'smb:///Volumes/shared/foo.pdf' + link2 = 'smb:///Volumes/shared/bar.pdf' + + doc = filter("See #{link1} and #{link2}") + + found_links = doc.css('a') + + expect(found_links.size).to eq(2) + expect(found_links[0].text).to eq(link1) + expect(found_links[0]['href']).to eq(link1) + expect(found_links[1].text).to eq(link2) + expect(found_links[1]['href']).to eq(link2) + end + it 'autolinks irc' do link = 'irc://irc.freenode.net/git' doc = filter("See #{link}") @@ -132,6 +132,45 @@ describe Banzai::Filter::AutolinkFilter do expect(doc.at_css('a').text).to eq link end + it 'includes trailing punctuation when part of a balanced pair' do + described_class::PUNCTUATION_PAIRS.each do |close, open| + next if open.in?(quotes) + + balanced_link = "#{link}#{open}abc#{close}" + balanced_actual = filter("See #{balanced_link}...") + unbalanced_link = "#{link}#{close}" + unbalanced_actual = filter("See #{unbalanced_link}...") + + expect(balanced_actual.at_css('a').text).to eq(balanced_link) + expect(unescape(balanced_actual.to_html)).to eq(Rinku.auto_link("See #{balanced_link}...")) + expect(unbalanced_actual.at_css('a').text).to eq(link) + expect(unescape(unbalanced_actual.to_html)).to eq(Rinku.auto_link("See #{unbalanced_link}...")) + end + end + + it 'removes trailing quotes' do + quotes.each do |quote| + balanced_link = "#{link}#{quote}abc#{quote}" + balanced_actual = filter("See #{balanced_link}...") + unbalanced_link = "#{link}#{quote}" + unbalanced_actual = filter("See #{unbalanced_link}...") + + expect(balanced_actual.at_css('a').text).to eq(balanced_link[0...-1]) + expect(unescape(balanced_actual.to_html)).to eq(Rinku.auto_link("See #{balanced_link}...")) + expect(unbalanced_actual.at_css('a').text).to eq(link) + expect(unescape(unbalanced_actual.to_html)).to eq(Rinku.auto_link("See #{unbalanced_link}...")) + end + end + + it 'removes one closing punctuation mark when the punctuation in the link is unbalanced' do + complicated_link = "(#{link}(a'b[c'd]))'" + expected_complicated_link = %Q{(<a href="#{link}(a'b[c'd]))">#{link}(a'b[c'd]))</a>'} + actual = unescape(filter(complicated_link).to_html) + + expect(actual).to eq(Rinku.auto_link(complicated_link)) + expect(actual).to eq(expected_complicated_link) + end + it 'does not include trailing HTML entities' do doc = filter("See <<<#{link}>>>") @@ -151,4 +190,29 @@ describe Banzai::Filter::AutolinkFilter do end end end + + context 'when the link is inside a tag' do + %w[http rdar].each do |protocol| + it "renders text after the link correctly for #{protocol}" do + doc = filter(ERB::Util.html_escape_once("<#{protocol}://link><another>")) + + expect(doc.children.last.text).to include('<another>') + end + end + end + + # Rinku does not escape these characters in HTML attributes, but content_tag + # does. We don't care about that difference for these specs, though. + def unescape(html) + %w([ ] { }).each do |cgi_escape| + html.sub!(CGI.escape(cgi_escape), cgi_escape) + end + + quotes.each do |html_escape| + html.sub!(CGI.escape_html(html_escape), html_escape) + html.sub!(CGI.escape(html_escape), CGI.escape_html(html_escape)) + end + + html + end end diff --git a/spec/lib/gitlab/string_regex_marker_spec.rb b/spec/lib/gitlab/string_regex_marker_spec.rb index d715f9bd641..37b1298b962 100644 --- a/spec/lib/gitlab/string_regex_marker_spec.rb +++ b/spec/lib/gitlab/string_regex_marker_spec.rb @@ -2,17 +2,36 @@ require 'spec_helper' describe Gitlab::StringRegexMarker do describe '#mark' do - let(:raw) { %{"name": "AFNetworking"} } - let(:rich) { %{<span class="key">"name"</span><span class="punctuation">: </span><span class="value">"AFNetworking"</span>}.html_safe } - subject do - described_class.new(raw, rich).mark(/"[^"]+":\s*"(?<name>[^"]+)"/, group: :name) do |text, left:, right:| - %{<a href="#">#{text}</a>} + context 'with a single occurrence' do + let(:raw) { %{"name": "AFNetworking"} } + let(:rich) { %{<span class="key">"name"</span><span class="punctuation">: </span><span class="value">"AFNetworking"</span>}.html_safe } + + subject do + described_class.new(raw, rich).mark(/"[^"]+":\s*"(?<name>[^"]+)"/, group: :name) do |text, left:, right:| + %{<a href="#">#{text}</a>} + end + end + + it 'marks the match' do + expect(subject).to eq(%{<span class="key">"name"</span><span class="punctuation">: </span><span class="value">"<a href="#">AFNetworking</a>"</span>}) + expect(subject).to be_html_safe end end - it 'marks the inline diffs' do - expect(subject).to eq(%{<span class="key">"name"</span><span class="punctuation">: </span><span class="value">"<a href="#">AFNetworking</a>"</span>}) - expect(subject).to be_html_safe + context 'with multiple occurrences' do + let(:raw) { %{a <b> <c> d} } + let(:rich) { %{a <b> <c> d}.html_safe } + + subject do + described_class.new(raw, rich).mark(/<[a-z]>/) do |text, left:, right:| + %{<strong>#{text}</strong>} + end + end + + it 'marks the matches' do + expect(subject).to eq(%{a <strong><b></strong> <strong><c></strong> d}) + expect(subject).to be_html_safe + end end end end |