summaryrefslogtreecommitdiff
path: root/magic/__init__.py
blob: f2fd34d1f3763d806e67afa1773cc48d7d74eab8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
"""
magic is a wrapper around the libmagic file identification library.

See README for more information.

Usage:

>>> import magic
>>> magic.from_file("testdata/test.pdf")
'PDF document, version 1.2'
>>> magic.from_file("testdata/test.pdf", mime=True)
'application/pdf'
>>> magic.from_buffer(open("testdata/test.pdf", "rb").read(1024))
'PDF document, version 1.2'
>>>

"""

import sys
import glob
import ctypes
import ctypes.util
import threading
import logging

from ctypes import c_char_p, c_int, c_size_t, c_void_p, byref, POINTER

# Keep a private reference to the builtin open(): _add_compat() at the bottom
# of this module installs compat.py's open() into the module globals, which
# shadows the builtin for code in this file that looks up `open` afterwards.
_real_open = open


class MagicException(Exception):
    """
    Error raised when a libmagic call reports a failure.

    message - the error text that accompanied the failure; callers in this
              module treat a value of None as "no message available".
    """

    def __init__(self, message):
        # Name this class (not Exception) in super() so initialisation starts
        # with the class that follows MagicException in the MRO, rather than
        # skipping Exception itself.
        super(MagicException, self).__init__(message)
        self.message = message


class Magic:
    """
    Magic is a wrapper around the libmagic C library.

    Each instance owns one libmagic cookie, opened in __init__ and closed
    in __del__.  Every identification call holds self.lock while it uses
    the cookie.
    """

    def __init__(self, mime=False, magic_file=None, mime_encoding=False,
                 keep_going=False, uncompress=False, raw=False, extension=False):
        """
        Create a new libmagic wrapper.

        mime - if True, mimetypes are returned instead of textual descriptions
        mime_encoding - if True, codec is returned
        magic_file - use a mime database other than the system default
        keep_going - don't stop at the first match, keep going
        uncompress - Try to look inside compressed files.
        raw - Do not try to decode "non-printable" chars.
        extension - Print a slash-separated list of valid extensions for the file type found.

        Raises MagicException if loading the magic database fails, and
        NotImplementedError if `extension` is requested but the loaded
        libmagic does not report a version of at least 524.
        """

        # Set before anything can fail so that __del__ always finds it.
        self.cookie = None
        self.flags = MAGIC_NONE
        if mime:
            self.flags |= MAGIC_MIME_TYPE
        if mime_encoding:
            self.flags |= MAGIC_MIME_ENCODING
        if keep_going:
            self.flags |= MAGIC_CONTINUE
        if uncompress:
            self.flags |= MAGIC_COMPRESS
        if raw:
            self.flags |= MAGIC_RAW
        if extension:
            self.flags |= MAGIC_EXTENSION

        self.cookie = magic_open(self.flags)
        self.lock = threading.Lock()

        magic_load(self.cookie, magic_file)

        # MAGIC_EXTENSION was added in 523 or 524, so bail if
        # it doesn't appear to be available
        if extension and (not _has_version or version() < 524):
            raise NotImplementedError('MAGIC_EXTENSION is not supported in this version of libmagic')

        # For https://github.com/ahupp/python-magic/issues/190
        # libmagic has fixed internal limits that some files exceed, causing
        # an error.  We can avoid this (at least for the sample file given)
        # by bumping the limit up.  It's not clear if this is a general solution
        # or whether other internal limits should be increased, but given
        # the lack of other reports I'll assume this is rare.
        if _has_param:
            try:
                self.setparam(MAGIC_PARAM_NAME_MAX, 64)
            except MagicException:
                # some versions of libmagic fail this call,
                # so rather than fail hard just use default behavior
                pass

    def from_buffer(self, buf):
        """
        Identify the contents of `buf`
        """
        with self.lock:
            try:
                # if we're on python3, convert buf to bytes
                # otherwise this string is passed as wchar*
                # which is not what libmagic expects.
                # isinstance (rather than an exact type comparison) so that
                # str subclasses are converted as well.
                if isinstance(buf, str) and str != bytes:
                    buf = buf.encode('utf-8', errors='replace')
                return maybe_decode(magic_buffer(self.cookie, buf))
            except MagicException as e:
                return self._handle509Bug(e)

    def from_file(self, filename):
        """
        Identify the contents of the file named `filename`.

        The file is opened (and immediately closed) first, so a missing or
        unreadable file raises the usual open() error instead of a libmagic
        failure.
        """
        # raise FileNotFoundException or IOError if the file does not exist
        with _real_open(filename):
            pass

        with self.lock:
            try:
                return maybe_decode(magic_file(self.cookie, filename))
            except MagicException as e:
                return self._handle509Bug(e)

    def from_descriptor(self, fd):
        """
        Identify the contents of the file referred to by descriptor `fd`.
        """
        with self.lock:
            try:
                return maybe_decode(magic_descriptor(self.cookie, fd))
            except MagicException as e:
                return self._handle509Bug(e)

    def _handle509Bug(self, e):
        """
        Translate a message-less failure into a generic mimetype.

        Returns "application/octet-stream" when `e` carries no message and
        this instance was created in mime mode; otherwise re-raises `e`.
        """
        # libmagic 5.09 has a bug where it might fail to identify the
        # mimetype of a file and returns null from magic_file (and
        # likely _buffer), but also does not return an error message.
        if e.message is None and (self.flags & MAGIC_MIME_TYPE):
            return "application/octet-stream"
        else:
            raise e

    def setparam(self, param, val):
        """
        Set libmagic parameter `param` (one of the MAGIC_PARAM_* constants)
        to `val` for this instance's cookie.
        """
        return magic_setparam(self.cookie, param, val)

    def getparam(self, param):
        """
        Return the current value of libmagic parameter `param` for this
        instance's cookie.
        """
        return magic_getparam(self.cookie, param)

    def __del__(self):
        # No locking here because there can be no other references to
        # this object at this point.

        # __init__ may never have run (for example when it was called with
        # bad arguments), in which case the attribute does not exist at all;
        # getattr keeps that case from raising inside the finalizer.
        cookie = getattr(self, "cookie", None)

        # during shutdown magic_close may have been cleared already so
        # make sure it exists before using it.
        if cookie and magic_close:
            magic_close(cookie)
            self.cookie = None


# Shared Magic instances used by the module-level helper functions below,
# keyed by the `mime` argument they were created with.
_instances = {}


def _get_magic_type(mime):
    """
    Return the shared Magic instance for `mime`, creating and caching it
    the first time it is requested.
    """
    cached = _instances.get(mime)
    if cached is not None:
        return cached
    created = Magic(mime=mime)
    _instances[mime] = created
    return created


def from_file(filename, mime=False):
    """
    Accepts a filename and returns the detected filetype.  Return
    value is the mimetype if mime=True, otherwise a human readable
    name.

    >>> magic.from_file("testdata/test.pdf", mime=True)
    'application/pdf'
    """
    m = _get_magic_type(mime)
    return m.from_file(filename)


def from_buffer(buffer, mime=False):
    """
    Identify the type of the data held in `buffer`.

    Returns the mimetype when mime=True, otherwise a human readable
    description.

    >>> magic.from_buffer(open("testdata/test.pdf", "rb").read(1024))
    'PDF document, version 1.2'
    """
    return _get_magic_type(mime).from_buffer(buffer)


def from_descriptor(fd, mime=False):
    """
    Identify the type of the file referred to by file descriptor `fd`.

    Returns the mimetype when mime=True, otherwise a human readable
    description.

    >>> f = open("testdata/test.pdf")
    >>> magic.from_descriptor(f.fileno())
    'PDF document, version 1.2'
    """
    return _get_magic_type(mime).from_descriptor(fd)


# Locate and load the libmagic shared library at import time.
#
# Step 1: ask ctypes.util.find_library for each of the known library names.
# Step 2: if that produced nothing, try a per-platform list of file names and
#         absolute paths directly with ctypes.CDLL.
# Step 3: if there is still no library, fail the import.
libmagic = None
# Let's try to find magic or magic1
dll = ctypes.util.find_library('magic') \
      or ctypes.util.find_library('magic1') \
      or ctypes.util.find_library('cygmagic-1') \
      or ctypes.util.find_library('libmagic-1') \
      or ctypes.util.find_library('msys-magic-1')  # for MSYS2

# necessary because find_library returns None if it doesn't find the library
if dll:
    libmagic = ctypes.CDLL(dll)

if not libmagic or not libmagic._name:
    # The same DLL names are tried for both native Windows and Cygwin.
    windows_dlls = ['magic1.dll', 'cygmagic-1.dll', 'libmagic-1.dll', 'msys-magic-1.dll']
    platform_to_lib = {'darwin': ['/opt/local/lib/libmagic.dylib',
                                  '/usr/local/lib/libmagic.dylib'] +
                                 # Assumes there will only be one version installed
                                 glob.glob('/usr/local/Cellar/libmagic/*/lib/libmagic.dylib'),  # flake8:noqa
                       'win32': windows_dlls,
                       'cygwin': windows_dlls,
                       # The 'linux' entry is a fallback for some Linuxes
                       # (e.g. Alpine) where library search does not work.
                       'linux': ['libmagic.so.1'],
                       # fallback for some Linuxes (e.g. Alpine) where library search does not work # flake8:noqa
                       }
    # Any sys.platform value beginning with 'linux' is folded into the
    # single 'linux' key used above.
    platform = 'linux' if sys.platform.startswith('linux') else sys.platform
    # Use the first candidate that loads; candidates that fail to load are
    # skipped silently.
    for dll in platform_to_lib.get(platform, []):
        try:
            libmagic = ctypes.CDLL(dll)
            break
        except OSError:
            pass

if not libmagic or not libmagic._name:
    # It is better to raise an ImportError since we are importing magic module
    raise ImportError('failed to find libmagic.  Check your installation')

# ctypes type used for the libmagic cookie (handle) in the prototypes below.
magic_t = ctypes.c_void_p


def errorcheck_null(result, func, args):
    """
    ctypes errcheck hook for calls whose failure value is None.

    Returns `result` unchanged unless it is None; in that case the error
    text is fetched with magic_error(args[0]) and raised as a
    MagicException.  `func` is accepted for the errcheck signature but
    not used.
    """
    if result is not None:
        return result
    raise MagicException(magic_error(args[0]))


def errorcheck_negative_one(result, func, args):
    """
    ctypes errcheck hook for calls whose failure value is -1.

    Returns `result` unchanged unless it equals -1; in that case the error
    text is fetched with magic_error(args[0]) and raised as a
    MagicException.  `func` is accepted for the errcheck signature but
    not used.
    """
    if result != -1:
        return result
    raise MagicException(magic_error(args[0]))


# Produce the native `str` type on both python2 and python3: decoding
# unconditionally would yield unicode objects on python2.
def maybe_decode(s):
    """
    Return `s` as a native str.

    Where str and bytes are the same type `s` is returned untouched;
    otherwise it is decoded as UTF-8 with undecodable bytes rendered as
    backslash escapes.
    """
    if str != bytes:
        # backslashreplace because libmagic sometimes returns metadata in
        # the charset of the file, which is unknown to us (e.g. the title
        # of a Word doc)
        return s.decode('utf-8', 'backslashreplace')
    return s


def coerce_filename(filename):
    """
    Convert `filename` into a value suitable for a char* argument.

    None is returned as None.  Objects implementing the os.PathLike
    protocol (such as pathlib.Path) are first converted with os.fspath.
    Text is then encoded as UTF-8 with the 'surrogateescape' error handler;
    any other value (e.g. bytes) is returned unchanged.
    """
    if filename is None:
        return None

    # Imported locally because this module does not import os at file level.
    import os

    # Only objects that advertise __fspath__ are converted, so values that
    # were previously passed through untouched still are.  os.fspath does not
    # exist on older interpreters, hence the hasattr guard.
    if hasattr(os, 'fspath') and hasattr(filename, '__fspath__'):
        filename = os.fspath(filename)

    # ctypes will implicitly convert unicode strings to bytes with
    # .encode('ascii').  If you use the filesystem encoding
    # then you'll get inconsistent behavior (crashes) depending on the user's
    # LANG environment variable
    if sys.version_info[0] <= 2:
        is_unicode = isinstance(filename, unicode)  # noqa: F821 (python2 only)
    else:
        is_unicode = isinstance(filename, str)

    if is_unicode:
        return filename.encode('utf-8', 'surrogateescape')
    return filename


# ctypes prototypes for the core libmagic entry points.  Each group binds the
# foreign function and declares its return and argument types.

# magic_open(flags: int) -> cookie
magic_open = libmagic.magic_open
magic_open.restype = magic_t
magic_open.argtypes = [c_int]

# magic_close(cookie) -> None
magic_close = libmagic.magic_close
magic_close.restype = None
magic_close.argtypes = [magic_t]

# magic_error(cookie) -> char* (used by the errcheck hooks to obtain the
# error text for a failed call)
magic_error = libmagic.magic_error
magic_error.restype = c_char_p
magic_error.argtypes = [magic_t]

# magic_errno(cookie) -> int
magic_errno = libmagic.magic_errno
magic_errno.restype = c_int
magic_errno.argtypes = [magic_t]

# magic_file(cookie, filename: char*) -> char*; a None result is turned into
# a MagicException by errorcheck_null.  Callers should use the magic_file()
# wrapper below, which encodes the filename first.
_magic_file = libmagic.magic_file
_magic_file.restype = c_char_p
_magic_file.argtypes = [magic_t, c_char_p]
_magic_file.errcheck = errorcheck_null


def magic_file(cookie, filename):
    """
    Call libmagic's magic_file for `filename` using `cookie`.

    `filename` is passed through coerce_filename before the call.
    """
    encoded = coerce_filename(filename)
    return _magic_file(cookie, encoded)


# magic_buffer(cookie, data: void*, length: size_t) -> char*; a None result
# is turned into a MagicException by errorcheck_null.  The data is declared
# as a void pointer accompanied by an explicit length.
_magic_buffer = libmagic.magic_buffer
_magic_buffer.restype = c_char_p
_magic_buffer.argtypes = [magic_t, c_void_p, c_size_t]
_magic_buffer.errcheck = errorcheck_null


def magic_buffer(cookie, buf):
    """
    Call libmagic's magic_buffer on `buf` using `cookie`, supplying
    len(buf) as the buffer length.
    """
    size = len(buf)
    return _magic_buffer(cookie, buf, size)


# magic_descriptor(cookie, fd: int) -> char*; a None result is turned into a
# MagicException by errorcheck_null.  Only the private name is bound here:
# the public name `magic_descriptor` is the Python wrapper defined below, so
# configuring the foreign function a second time under that name was
# redundant.
_magic_descriptor = libmagic.magic_descriptor
_magic_descriptor.restype = c_char_p
_magic_descriptor.argtypes = [magic_t, c_int]
_magic_descriptor.errcheck = errorcheck_null


def magic_descriptor(cookie, fd):
    """
    Call libmagic's magic_descriptor for file descriptor `fd` using `cookie`.
    """
    return _magic_descriptor(cookie, fd)


# magic_load(cookie, filename: char*) -> int; a result of -1 is turned into
# a MagicException by errorcheck_negative_one.  Callers should use the
# magic_load() wrapper below, which encodes the filename first.
_magic_load = libmagic.magic_load
_magic_load.restype = c_int
_magic_load.argtypes = [magic_t, c_char_p]
_magic_load.errcheck = errorcheck_negative_one


def magic_load(cookie, filename):
    """
    Call libmagic's magic_load for `cookie` with the database `filename`.

    `filename` is passed through coerce_filename before the call, so None
    is forwarded as None.
    """
    encoded = coerce_filename(filename)
    return _magic_load(cookie, encoded)


# The remaining prototypes are bound without an errcheck hook, so their
# integer results are returned to the caller as-is.

# magic_setflags(cookie, flags: int) -> int
magic_setflags = libmagic.magic_setflags
magic_setflags.restype = c_int
magic_setflags.argtypes = [magic_t, c_int]

# magic_check(cookie, filename: char*) -> int
magic_check = libmagic.magic_check
magic_check.restype = c_int
magic_check.argtypes = [magic_t, c_char_p]

# magic_compile(cookie, filename: char*) -> int
magic_compile = libmagic.magic_compile
magic_compile.restype = c_int
magic_compile.argtypes = [magic_t, c_char_p]

# magic_setparam / magic_getparam are not present in every libmagic build,
# so they are bound only when the loaded library exports both.  _has_param
# records whether they are available.
_has_param = False
if hasattr(libmagic, 'magic_setparam') and hasattr(libmagic, 'magic_getparam'):
    _has_param = True
    # magic_setparam(cookie, param: int, value: size_t*) -> int; -1 raises
    # MagicException via errorcheck_negative_one.
    _magic_setparam = libmagic.magic_setparam
    _magic_setparam.restype = c_int
    _magic_setparam.argtypes = [magic_t, c_int, POINTER(c_size_t)]
    _magic_setparam.errcheck = errorcheck_negative_one

    # magic_getparam(cookie, param: int, value: size_t*) -> int; -1 raises
    # MagicException via errorcheck_negative_one.
    _magic_getparam = libmagic.magic_getparam
    _magic_getparam.restype = c_int
    _magic_getparam.argtypes = [magic_t, c_int, POINTER(c_size_t)]
    _magic_getparam.errcheck = errorcheck_negative_one


def magic_setparam(cookie, param, val):
    """
    Set libmagic parameter `param` to `val` on `cookie`.

    `val` is wrapped in a c_size_t and passed by reference.  Raises
    NotImplementedError when the loaded libmagic lacks the parameter API.
    """
    if _has_param:
        boxed = c_size_t(val)
        return _magic_setparam(cookie, param, byref(boxed))
    raise NotImplementedError("magic_setparam not implemented")


def magic_getparam(cookie, param):
    """
    Return the value of libmagic parameter `param` on `cookie`.

    The value is read through a c_size_t passed by reference.  Raises
    NotImplementedError when the loaded libmagic lacks the parameter API.
    """
    if _has_param:
        out = c_size_t()
        _magic_getparam(cookie, param, byref(out))
        return out.value
    raise NotImplementedError("magic_getparam not implemented")


# magic_version is bound only when the loaded library exports it;
# _has_version records whether it is available.
_has_version = False
if hasattr(libmagic, "magic_version"):
    _has_version = True
    # magic_version() -> int
    magic_version = libmagic.magic_version
    magic_version.restype = c_int
    magic_version.argtypes = []


def version():
    """
    Return the integer version reported by libmagic's magic_version
    (this module compares it against values such as 524).

    Raises NotImplementedError when the loaded libmagic does not export
    magic_version.
    """
    if _has_version:
        return magic_version()
    raise NotImplementedError("magic_version not implemented")


# Flag bits accepted by magic_open / magic_setflags; combine with bitwise OR.
MAGIC_NONE = 0x000000  # No flags
MAGIC_DEBUG = 0x000001  # Turn on debugging
MAGIC_SYMLINK = 0x000002  # Follow symlinks
MAGIC_COMPRESS = 0x000004  # Check inside compressed files
MAGIC_DEVICES = 0x000008  # Look at the contents of devices
MAGIC_MIME_TYPE = 0x000010  # Return a mime string
MAGIC_MIME_ENCODING = 0x000400  # Return the MIME encoding
# TODO:  should be
# MAGIC_MIME = MAGIC_MIME_TYPE | MAGIC_MIME_ENCODING
# As defined here MAGIC_MIME has the same value as MAGIC_MIME_TYPE only.
MAGIC_MIME = 0x000010  # Return a mime string
MAGIC_EXTENSION = 0x1000000  # Return a /-separated list of extensions

MAGIC_CONTINUE = 0x000020  # Return all matches
MAGIC_CHECK = 0x000040  # Print warnings to stderr
MAGIC_PRESERVE_ATIME = 0x000080  # Restore access time on exit
MAGIC_RAW = 0x000100  # Don't translate unprintable chars
MAGIC_ERROR = 0x000200  # Handle ENOENT etc as real errors

MAGIC_NO_CHECK_COMPRESS = 0x001000  # Don't check for compressed files
MAGIC_NO_CHECK_TAR = 0x002000  # Don't check for tar files
MAGIC_NO_CHECK_SOFT = 0x004000  # Don't check magic entries
MAGIC_NO_CHECK_APPTYPE = 0x008000  # Don't check application type
MAGIC_NO_CHECK_ELF = 0x010000  # Don't check for elf details
MAGIC_NO_CHECK_ASCII = 0x020000  # Don't check for ascii files
MAGIC_NO_CHECK_TROFF = 0x040000  # Don't check ascii/troff
MAGIC_NO_CHECK_FORTRAN = 0x080000  # Don't check ascii/fortran
MAGIC_NO_CHECK_TOKENS = 0x100000  # Don't check ascii/tokens

# Parameter identifiers for magic_setparam / magic_getparam.
MAGIC_PARAM_INDIR_MAX = 0  # Recursion limit for indirect magic
MAGIC_PARAM_NAME_MAX = 1  # Use count limit for name/use magic
MAGIC_PARAM_ELF_PHNUM_MAX = 2  # Max ELF program sections processed
MAGIC_PARAM_ELF_SHNUM_MAX = 3  # Max ELF sections processed
MAGIC_PARAM_ELF_NOTES_MAX = 4  # Max ELF notes processed
MAGIC_PARAM_REGEX_MAX = 5  # Length limit for regex searches
MAGIC_PARAM_BYTES_MAX = 6  # Max number of bytes to read from file


# This package name conflicts with the one provided by upstream
# libmagic.  This is a common source of confusion for users.  To
# resolve, we ship a copy of that module, and expose its functions
# wrapped in deprecation warnings.
def _add_compat(to_module):
    """
    Install the compat-mode API into the namespace dict `to_module`.

    The functions detect_from_filename, detect_from_content,
    detect_from_fobj and open from magic.compat are added wrapped so that
    each call emits a PendingDeprecationWarning.  Names in magic.compat
    consisting only of upper-case letters and underscores are copied too.

    Raises Exception if such a name already exists in `to_module` with a
    different value, except for the names explicitly allowed to differ.
    """
    import functools
    import re
    import warnings
    from magic import compat

    def deprecation_wrapper(fn):
        # wraps() keeps the wrapped function's name and docstring visible on
        # the wrapper.
        @functools.wraps(fn)
        def _(*args, **kwargs):
            # stacklevel=2 attributes the warning to the code calling the
            # compat function rather than to this wrapper.
            warnings.warn(
                "Using compatability mode with libmagic's python binding. "
                "See https://github.com/ahupp/python-magic/blob/master/COMPAT.md for details.",
                PendingDeprecationWarning,
                stacklevel=2)

            return fn(*args, **kwargs)

        return _

    fn = ['detect_from_filename',
          'detect_from_content',
          'detect_from_fobj',
          'open']
    for fname in fn:
        to_module[fname] = deprecation_wrapper(compat.__dict__[fname])

    # copy constants over, ensuring there's no conflicts
    is_const_re = re.compile("^[A-Z_]+$")
    allowed_inconsistent = set(['MAGIC_MIME'])
    for name, value in compat.__dict__.items():
        if not is_const_re.match(name):
            continue
        if name not in to_module:
            to_module[name] = value
        elif name not in allowed_inconsistent and to_module[name] != value:
            raise Exception("inconsistent value for " + name)


# Install the compat-mode functions and constants into this module's own
# namespace.  This runs last so that the constants defined above already
# exist when they are compared against the compat module's values.
_add_compat(globals())