diff options
author | GitLab Bot <gitlab-bot@gitlab.com> | 2021-05-19 15:44:42 +0000 |
---|---|---|
committer | GitLab Bot <gitlab-bot@gitlab.com> | 2021-05-19 15:44:42 +0000 |
commit | 4555e1b21c365ed8303ffb7a3325d773c9b8bf31 (patch) | |
tree | 5423a1c7516cffe36384133ade12572cf709398d /lib/gitlab/encoding_helper.rb | |
parent | e570267f2f6b326480d284e0164a6464ba4081bc (diff) | |
download | gitlab-ce-4555e1b21c365ed8303ffb7a3325d773c9b8bf31.tar.gz |
Add latest changes from gitlab-org/gitlab@13-12-stable-ee (v13.12.0-rc42)
Diffstat (limited to 'lib/gitlab/encoding_helper.rb')
-rw-r--r-- | lib/gitlab/encoding_helper.rb | 31 |
1 files changed, 23 insertions, 8 deletions
diff --git a/lib/gitlab/encoding_helper.rb b/lib/gitlab/encoding_helper.rb index 7b79de00c66..8ee53d0de28 100644 --- a/lib/gitlab/encoding_helper.rb +++ b/lib/gitlab/encoding_helper.rb @@ -20,7 +20,7 @@ module Gitlab return message if message.valid_encoding? # return message if message type is binary - detect = CharlockHolmes::EncodingDetector.detect(message) + detect = detect_encoding(message) return message.force_encoding("BINARY") if detect_binary?(message, detect) if detect && detect[:encoding] && detect[:confidence] > ENCODING_CONFIDENCE_THRESHOLD @@ -37,16 +37,30 @@ module Gitlab "--broken encoding: #{encoding}" end + def detect_encoding(data, limit: CharlockHolmes::EncodingDetector::DEFAULT_BINARY_SCAN_LEN, cache_key: nil) + return if data.nil? + + if Feature.enabled?(:cached_encoding_detection, type: :development, default_enabled: :yaml) + return CharlockHolmes::EncodingDetector.new(limit).detect(data) unless cache_key.present? + + Rails.cache.fetch([:detect_binary, CharlockHolmes::VERSION, cache_key], expires_in: 1.week) do + CharlockHolmes::EncodingDetector.new(limit).detect(data) + end + else + CharlockHolmes::EncodingDetector.new(limit).detect(data) + end + end + def detect_binary?(data, detect = nil) - detect ||= CharlockHolmes::EncodingDetector.detect(data) + detect ||= detect_encoding(data) detect && detect[:type] == :binary && detect[:confidence] == 100 end - def detect_libgit2_binary?(data) - # EncodingDetector checks the first 1024 * 1024 bytes for NUL byte, libgit2 checks - # only the first 8000 (https://github.com/libgit2/libgit2/blob/2ed855a9e8f9af211e7274021c2264e600c0f86b/src/filter.h#L15), - # which is what we use below to keep a consistent behavior. 
- detect = CharlockHolmes::EncodingDetector.new(8000).detect(data) + # EncodingDetector checks the first 1024 * 1024 bytes for NUL byte, libgit2 checks + # only the first 8000 (https://github.com/libgit2/libgit2/blob/2ed855a9e8f9af211e7274021c2264e600c0f86b/src/filter.h#L15), + # which is what we use below to keep a consistent behavior. + def detect_libgit2_binary?(data, cache_key: nil) + detect = detect_encoding(data, limit: 8000, cache_key: cache_key) detect && detect[:type] == :binary end @@ -54,7 +68,8 @@ module Gitlab message = force_encode_utf8(message) return message if message.valid_encoding? - detect = CharlockHolmes::EncodingDetector.detect(message) + detect = detect_encoding(message) + if detect && detect[:encoding] begin CharlockHolmes::Converter.convert(message, detect[:encoding], 'UTF-8') |