1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
|
# This has been extracted from https://github.com/github/linguist/blob/master/lib/linguist/blob_helper.rb
module Gitlab
  # Content-inspection helpers for blob-like objects.
  #
  # The including object must respond to `name`, `data` and `size`; every
  # helper below is derived from those three values. `data` may be nil (see
  # `binary?`), in which case the blob is treated as binary.
  module BlobHelper
    MEGABYTE = 1024 * 1024

    # Lower-cased extensions (with leading dot) that `image?` recognises.
    IMAGE_EXTENSIONS = %w[.png .jpg .jpeg .gif].freeze

    # Returns the extension of `name` (leading dot included) as reported by
    # File.extname; `name` is stringified first, so a nil name yields "".
    def extname
      File.extname(name.to_s)
    end

    # Returns true when the extension appears in LanguageData.extensions
    # (LanguageData is defined outside this file).
    def known_extension?
      LanguageData.extensions.include?(extname)
    end

    # Returns true when the blob is text and not `large?`.
    def viewable?
      !large? && text?
    end

    # Returns true when `size` (coerced with to_i, so nil counts as 0) is
    # strictly greater than one megabyte.
    def large?
      size.to_i > MEGABYTE
    end

    # Returns true when the blob should be treated as binary content.
    def binary?
      # Large blobs aren't even loaded into memory
      return true if data.nil?
      # Treat blank files as text
      return false if data == ""
      # Charlock doesn't know what to think
      return true if encoding.nil?

      # Otherwise go with whatever Charlock says
      detect_encoding[:type] == :binary
    end

    # Returns true when the blob is not `binary?`.
    def text?
      !binary?
    end

    # Returns true when the extension (compared case-insensitively) is one of
    # IMAGE_EXTENSIONS.
    def image?
      IMAGE_EXTENSIONS.include?(extname.downcase)
    end

    # Internal: Lookup mime type for extension.
    #
    # The result is memoized with `defined?` so that a nil lookup result is
    # cached as well.
    #
    # Returns a MIME::Type, or nil when no type matches the extension.
    # rubocop:disable Gitlab/ModuleWithInstanceVariables
    def _mime_type
      return @_mime_type if defined?(@_mime_type)

      guesses = ::MIME::Types.type_for(extname.to_s)
      # Prefer text mime types over binary
      @_mime_type = guesses.detect(&:ascii?) || guesses.first
    end
    # rubocop:enable Gitlab/ModuleWithInstanceVariables

    # Public: Get the actual blob mime type
    #
    # Examples
    #
    #   # => 'text/plain'
    #   # => 'text/html'
    #
    # Returns a mime type String; 'text/plain' when the extension is unknown.
    def mime_type
      _mime_type ? _mime_type.to_s : 'text/plain'
    end

    # Returns true when the extension maps to a binary mime type, false when
    # it maps to a non-binary one or to none at all.
    def binary_mime_type?
      _mime_type ? _mime_type.binary? : false
    end

    # Returns the blob split into lines (memoized).
    #
    # Non-viewable blobs and blobs without content yield []. Each entry keeps
    # the encoding of `data`.
    def lines
      @lines ||=
        # Splitting an empty string always yields [], so skip encoding
        # detection entirely for blank data. This mirrors the blank-file
        # special case in `binary?` and keeps `lines` from depending on the
        # detector producing a usable encoding for empty input.
        if viewable? && data && data != ""
          # `data` is usually encoded as ASCII-8BIT even when the content has
          # been detected as a different encoding. However, we are not allowed
          # to change the encoding of `data` because we've made the implicit
          # guarantee that each entry in `lines` is encoded the same way as
          # `data`.
          #
          # Instead, we re-encode each possible newline sequence as the
          # detected encoding, then force them back to the encoding of `data`
          # (usually a binary encoding like ASCII-8BIT). This means that the
          # byte sequence will match how newlines are likely encoded in the
          # file, but we don't have to change the encoding of `data` as far as
          # Ruby is concerned. This allows us to correctly parse out each line
          # without changing the encoding of `data`, and
          # also--importantly--without having to duplicate many (potentially
          # large) strings.
          begin
            data.split(encoded_newlines_re, -1)
          rescue Encoding::ConverterNotFoundError
            # The data is not splittable in the detected encoding. Assume it's
            # one big line.
            [data]
          end
        else
          []
        end
    end

    # Returns the content type String for the blob (memoized).
    #
    # Binary blobs (by mime type or by content) report their mime type; text
    # blobs report text/plain, with a lower-cased charset when an encoding
    # was detected.
    def content_type
      @content_type ||=
        if binary_mime_type? || binary?
          mime_type
        elsif encoding
          "text/plain; charset=#{encoding.downcase}"
        else
          "text/plain"
        end
    end

    # Returns a Regexp (memoized) matching "\r\n", "\r" or "\n" as those
    # sequences are encoded in the detected encoding, tagged with the
    # encoding of `data` so it can be applied to `data` directly.
    def encoded_newlines_re
      @encoded_newlines_re ||= Regexp.union(
        ["\r\n", "\r", "\n"].map do |newline|
          newline.encode(ruby_encoding, "ASCII-8BIT").force_encoding(data.encoding)
        end
      )
    end

    # Returns the :ruby_encoding entry of the detection result, or nil when
    # nothing was detected.
    def ruby_encoding
      detected = detect_encoding
      detected[:ruby_encoding] if detected
    end

    # Returns the :encoding entry of the detection result, or nil when
    # nothing was detected.
    def encoding
      detected = detect_encoding
      detected[:encoding] if detected
    end

    # Runs CharlockHolmes encoding detection on `data` and memoizes a truthy
    # result. Returns nil without caching anything while `data` is nil.
    def detect_encoding
      @detect_encoding ||= CharlockHolmes::EncodingDetector.new.detect(data) if data # rubocop:disable Gitlab/ModuleWithInstanceVariables
    end

    # Returns true when `data` is nil or the empty string.
    def empty?
      data.nil? || data == ""
    end
  end
end
|