diff options
Diffstat (limited to 'lib/gitlab/blob_helper.rb')
-rw-r--r-- | lib/gitlab/blob_helper.rb | 145 |
1 files changed, 145 insertions, 0 deletions
# This has been extracted from https://github.com/github/linguist/blob/master/lib/linguist/blob_helper.rb
module Gitlab
  # Content-inspection helpers meant to be mixed into blob-like objects.
  #
  # The methods below call the following on the including object:
  #
  #   name - converted with #to_s and used to derive the file extension
  #   size - converted with #to_i and compared against MEGABYTE
  #          (presumably a byte count; confirm against the including class)
  #   data - the blob content as a String, or nil when no content is loaded
  module BlobHelper
    # Threshold used by #large?.
    MEGABYTE = 1024 * 1024

    # Lower-case extensions that #image? recognises.
    IMAGE_EXTENSIONS = %w[.png .jpg .jpeg .gif].freeze

    # Newline sequences #lines splits on. "\r\n" comes first so that the
    # resulting regexp alternation prefers it over a lone "\r".
    NEWLINE_SEQUENCES = ["\r\n", "\r", "\n"].freeze

    # Public: Get the extension of `name`.
    #
    # Returns a String including the leading dot, or "" when there is none.
    def extname
      File.extname(name.to_s)
    end

    # Public: Is the extension listed in LanguageData.extensions?
    #
    # Returns true or false.
    def known_extension?
      LanguageData.extensions.include?(extname)
    end

    # Public: Is the blob small enough, and textual, so it can be displayed?
    #
    # Returns true or false.
    def viewable?
      !large? && text?
    end

    # Public: Is the blob bigger than MEGABYTE?
    #
    # Returns true or false.
    def large?
      size.to_i > MEGABYTE
    end

    # Public: Is the blob binary?
    #
    # Returns true or false.
    def binary?
      # Large blobs aren't even loaded into memory
      return true if data.nil?

      # Treat blank files as text
      return false if data == ""

      # Charlock doesn't know what to think
      return true if encoding.nil?

      # Otherwise go with what Charlock says
      detect_encoding[:type] == :binary
    end

    # Public: Is the blob text? This is simply the negation of #binary?.
    #
    # Returns true or false.
    def text?
      !binary?
    end

    # Public: Does the (case-insensitive) extension look like an image?
    #
    # Returns true or false.
    def image?
      IMAGE_EXTENSIONS.include?(extname.downcase)
    end

    # Internal: Lookup mime type for extension.
    #
    # The result is memoized with `defined?` so that a nil lookup is cached
    # as well.
    #
    # Returns a MIME::Type, or nil when the extension has no known type.
    # rubocop:disable Gitlab/ModuleWithInstanceVariables
    def _mime_type
      return @_mime_type if defined?(@_mime_type)

      guesses = ::MIME::Types.type_for(extname.to_s)

      # Prefer text mime types over binary
      @_mime_type = guesses.find(&:ascii?) || guesses.first
    end
    # rubocop:enable Gitlab/ModuleWithInstanceVariables

    # Public: Get the actual blob mime type
    #
    # Examples
    #
    #   # => 'text/plain'
    #   # => 'text/html'
    #
    # Returns a mime type String; 'text/plain' when the extension is unknown.
    def mime_type
      type = _mime_type
      type ? type.to_s : 'text/plain'
    end

    # Public: Does the extension map to a binary mime type?
    #
    # Returns true or false; false when the extension has no known mime type.
    def binary_mime_type?
      type = _mime_type
      type ? type.binary? : false
    end

    # Public: Split the blob content into lines.
    #
    # Returns an Array of Strings. The Array is empty when the blob is not
    # viewable or has no data, and contains `data` as a single entry when the
    # newline sequences cannot be converted to the detected encoding.
    def lines
      @lines ||=
        if viewable? && data
          # `data` is usually encoded as ASCII-8BIT even when the content has
          # been detected as a different encoding. However, we are not allowed
          # to change the encoding of `data` because we've made the implicit
          # guarantee that each entry in `lines` is encoded the same way as
          # `data`.
          #
          # Instead, each possible newline sequence is re-encoded as the
          # detected encoding and then forced back to the encoding of `data`
          # (usually a binary encoding like ASCII-8BIT). The byte sequence
          # then matches how newlines are likely encoded in the file, without
          # changing the encoding of `data` as far as Ruby is concerned, and
          # also--importantly--without having to duplicate many (potentially
          # large) strings.
          begin
            data.split(encoded_newlines_re, -1)
          rescue Encoding::ConverterNotFoundError
            # The data is not splittable in the detected encoding. Assume it's
            # one big line.
            [data]
          end
        else
          []
        end
    end

    # Public: Build a Content-Type style string for the blob.
    #
    # Returns the mime type for binary blobs, otherwise "text/plain" with a
    # lower-cased charset suffix when an encoding was detected.
    def content_type
      @content_type ||=
        if binary_mime_type? || binary?
          mime_type
        elsif encoding
          "text/plain; charset=#{encoding.downcase}"
        else
          "text/plain"
        end
    end

    # Internal: Regexp matching any newline sequence as it would be encoded in
    # the detected encoding, tagged with the encoding of `data`.
    #
    # Returns a Regexp.
    # Raises Encoding::ConverterNotFoundError when the newline sequences
    # cannot be converted to the detected encoding (handled by #lines).
    def encoded_newlines_re
      @encoded_newlines_re ||= begin
        target = ruby_encoding

        encoded_newlines = NEWLINE_SEQUENCES.map do |nl|
          # Without a detected encoding there is nothing to convert to, and
          # String#encode would raise TypeError on nil. #binary? treats a
          # blank blob as text, so #lines can get here with no detection
          # result; keep the plain newline bytes in that case.
          encoded = target ? nl.encode(target, "ASCII-8BIT") : nl.dup
          encoded.force_encoding(data.encoding)
        end

        Regexp.union(encoded_newlines)
      end
    end

    # Internal: The :ruby_encoding entry of the detection result.
    #
    # Returns the stored value, or nil when nothing was detected.
    def ruby_encoding
      detected = detect_encoding
      detected[:ruby_encoding] if detected
    end

    # Internal: The :encoding entry of the detection result.
    #
    # Returns the stored value, or nil when nothing was detected.
    def encoding
      detected = detect_encoding
      detected[:encoding] if detected
    end

    # Internal: Run CharlockHolmes encoding detection on `data`.
    #
    # A truthy result is memoized; nothing is detected (or cached) while
    # `data` is nil.
    #
    # Returns whatever CharlockHolmes::EncodingDetector#detect returns (this
    # module reads the :type, :encoding and :ruby_encoding keys from it), or
    # nil when `data` is nil.
    def detect_encoding
      @detect_encoding ||= CharlockHolmes::EncodingDetector.new.detect(data) if data # rubocop:disable Gitlab/ModuleWithInstanceVariables
    end

    # Public: Is there no content at all (nil or blank data)?
    #
    # Returns true or false.
    def empty?
      data.nil? || data == ""
    end
  end
end