summaryrefslogtreecommitdiff
path: root/Lib/gzip.py
diff options
context:
space:
mode:
Diffstat (limited to 'Lib/gzip.py')
-rw-r--r--Lib/gzip.py245
1 files changed, 167 insertions, 78 deletions
diff --git a/Lib/gzip.py b/Lib/gzip.py
index 8a2a7184df..ba2149ebf9 100644
--- a/Lib/gzip.py
+++ b/Lib/gzip.py
@@ -5,11 +5,12 @@ but random access is not allowed."""
# based on Andrew Kuchling's minigzip.py distributed with the zlib module
-import struct, sys, time
+import struct, sys, time, os
import zlib
import builtins
+import io
-__all__ = ["GzipFile","open"]
+__all__ = ["GzipFile", "open", "compress", "decompress"]
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16
@@ -44,7 +45,63 @@ def open(filename, mode="rb", compresslevel=9):
"""
return GzipFile(filename, mode, compresslevel)
-class GzipFile:
+class _PaddedFile:
+ """Minimal read-only file object that prepends a string to the contents
+ of an actual file. Shouldn't be used outside of gzip.py, as it lacks
+ essential functionality."""
+
+ def __init__(self, f, prepend=b''):
+ self._buffer = prepend
+ self._length = len(prepend)
+ self.file = f
+ self._read = 0
+
+ def read(self, size):
+ if self._read is None:
+ return self.file.read(size)
+ if self._read + size <= self._length:
+ read = self._read
+ self._read += size
+ return self._buffer[read:self._read]
+ else:
+ read = self._read
+ self._read = None
+ return self._buffer[read:] + \
+ self.file.read(size-self._length+read)
+
+ def prepend(self, prepend=b'', readprevious=False):
+ if self._read is None:  # buffer was fully consumed by read(); start a fresh one
+ self._buffer = prepend
+ elif readprevious and len(prepend) <= self._read:  # step back inside the existing buffer
+ self._read -= len(prepend)
+ return
+ else:
+ self._buffer = self._buffer[self._read:] + prepend  # keep the unread tail of the buffer
+ self._length = len(self._buffer)
+ self._read = 0
+
+ def unused(self):
+ if self._read is None:
+ return b''
+ return self._buffer[self._read:]
+
+ def seek(self, offset, whence=0):
+ # This is only ever called with offset=whence=0
+ if whence == 1 and self._read is not None:
+ if 0 <= offset + self._read <= self._length:
+ self._read += offset
+ return
+ else:
+ offset += self._length - self._read
+ self._read = None
+ self._buffer = None
+ return self.file.seek(offset, whence)
+
+ def __getattr__(self, name):
+ return getattr(self.file, name)
+
+
+class GzipFile(io.BufferedIOBase):
"""The GzipFile class simulates most of the methods of a file object with
the exception of the readinto() and truncate() methods.
@@ -109,11 +166,16 @@ class GzipFile:
self.mode = READ
# Set flag indicating start of a new member
self._new_member = True
+ # Buffer data read from gzip file. extrastart is offset in
+ # stream where buffer starts. extrasize is number of
+ # bytes remaining in buffer from current stream position.
self.extrabuf = b""
self.extrasize = 0
+ self.extrastart = 0
self.name = filename
# Starts small, scales exponentially
self.min_readsize = 100
+ fileobj = _PaddedFile(fileobj)
elif mode[0:1] == 'w' or mode[0:1] == 'a':
self.mode = WRITE
@@ -129,7 +191,6 @@ class GzipFile:
self.fileobj = fileobj
self.offset = 0
self.mtime = mtime
- self.closed = False
if self.mode == WRITE:
self._write_gzip_header()
@@ -143,7 +204,10 @@ class GzipFile:
return self.name
def __repr__(self):
- s = repr(self.fileobj)
+ fileobj = self.fileobj
+ if isinstance(fileobj, _PaddedFile):
+ fileobj = fileobj.file
+ s = repr(fileobj)
return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'
def _check_closed(self):
@@ -166,7 +230,8 @@ class GzipFile:
try:
# RFC 1952 requires the FNAME field to be Latin-1. Do not
# include filenames that cannot be represented that way.
- fname = self.name.encode('latin-1')
+ fname = os.path.basename(self.name)
+ fname = fname.encode('latin-1')
if fname.endswith(b'.gz'):
fname = fname[:-3]
except UnicodeEncodeError:
@@ -190,6 +255,9 @@ class GzipFile:
def _read_gzip_header(self):
magic = self.fileobj.read(2)
+ if magic == b'':
+ raise EOFError("Reached EOF")
+
if magic != b'\037\213':
raise IOError('Not a gzipped file')
method = ord( self.fileobj.read(1) )
@@ -221,6 +289,10 @@ class GzipFile:
if flag & FHCRC:
self.fileobj.read(2) # Read & discard the 16-bit header CRC
+ unused = self.fileobj.unused()
+ if unused:
+ uncompress = self.decompress.decompress(unused)
+ self._add_read_data(uncompress)
def write(self,data):
self._check_closed()
@@ -230,12 +302,19 @@ class GzipFile:
if self.fileobj is None:
raise ValueError("write() on closed GzipFile object")
+
+ # Convert data type if called by io.BufferedWriter.
+ if isinstance(data, memoryview):
+ data = data.tobytes()
+
if len(data) > 0:
self.size = self.size + len(data)
self.crc = zlib.crc32(data, self.crc) & 0xffffffff
self.fileobj.write( self.compress.compress(data) )
self.offset += len(data)
+ return len(data)
+
def read(self, size=-1):
self._check_closed()
if self.mode != READ:
@@ -262,15 +341,36 @@ class GzipFile:
if size > self.extrasize:
size = self.extrasize
- chunk = self.extrabuf[:size]
- self.extrabuf = self.extrabuf[size:]
+ offset = self.offset - self.extrastart
+ chunk = self.extrabuf[offset: offset + size]
self.extrasize = self.extrasize - size
self.offset += size
return chunk
+ def peek(self, n):
+ if self.mode != READ:
+ import errno
+ raise IOError(errno.EBADF, "peek() on write-only GzipFile object")
+
+ # Do not return ridiculously small buffers, for one common idiom
+ # is to call peek(1) and expect more bytes in return.
+ if n < 100:
+ n = 100
+ if self.extrasize == 0:
+ if self.fileobj is None:
+ return b''
+ try:
+ # 1024 is the same buffering heuristic used in read()
+ self._read(max(n, 1024))
+ except EOFError:
+ pass
+ offset = self.offset - self.extrastart
+ remaining = self.extrasize
+ assert remaining == len(self.extrabuf) - offset
+ return self.extrabuf[offset:offset + n]
+
def _unread(self, buf):
- self.extrabuf = buf + self.extrabuf
self.extrasize = len(buf) + self.extrasize
self.offset -= len(buf)
@@ -281,16 +381,6 @@ class GzipFile:
if self._new_member:
# If the _new_member flag is set, we have to
# jump to the next member, if there is one.
- #
- # First, check if we're at the end of the file;
- # if so, it's time to stop; no more members to read.
- pos = self.fileobj.tell() # Save current position
- self.fileobj.seek(0, 2) # Seek to end of file
- if pos == self.fileobj.tell():
- raise EOFError("Reached EOF")
- else:
- self.fileobj.seek( pos ) # Return to original position
-
self._init_read()
self._read_gzip_header()
self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
@@ -304,6 +394,9 @@ class GzipFile:
if buf == b"":
uncompress = self.decompress.flush()
+ # Prepend the already read bytes to the fileobj so they can be
+ # seen by _read_eof()
+ self.fileobj.prepend(self.decompress.unused_data, True)
self._read_eof()
self._add_read_data( uncompress )
raise EOFError('Reached EOF')
@@ -315,10 +408,9 @@ class GzipFile:
# Ending case: we've come to the end of a member in the file,
# so seek back to the start of the unused data, finish up
# this member, and read a new gzip header.
- # (The number of bytes to seek back is the length of the unused
- # data, minus 8 because _read_eof() will rewind a further 8 bytes)
- self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)
-
+ # Prepend the already read bytes to the fileobj so they can be
+ # seen by _read_eof() and _read_gzip_header()
+ self.fileobj.prepend(self.decompress.unused_data, True)
# Check the CRC and file size, and set the flag so we read
# a new member on the next call
self._read_eof()
@@ -326,17 +418,17 @@ class GzipFile:
def _add_read_data(self, data):
self.crc = zlib.crc32(data, self.crc) & 0xffffffff
- self.extrabuf = self.extrabuf + data
+ offset = self.offset - self.extrastart
+ self.extrabuf = self.extrabuf[offset:] + data
self.extrasize = self.extrasize + len(data)
+ self.extrastart = self.offset
self.size = self.size + len(data)
def _read_eof(self):
- # We've read to the end of the file, so we have to rewind in order
- # to reread the 8 bytes containing the CRC and the file size.
+ # We've read to the end of the file
# We check the that the computed CRC and size of the
# uncompressed data matches the stored values. Note that the size
# stored is the true file size mod 2**32.
- self.fileobj.seek(-8, 1)
crc32 = read32(self.fileobj)
isize = read32(self.fileobj) # may exceed 2GB
if crc32 != self.crc:
@@ -345,6 +437,19 @@ class GzipFile:
elif isize != (self.size & 0xffffffff):
raise IOError("Incorrect length of data produced")
+ # Gzip files can be padded with zeroes and still have archives.
+ # Consume all zero bytes and set the file position to the first
+ # non-zero byte. See http://www.gzip.org/#faq8
+ c = b"\x00"
+ while c == b"\x00":
+ c = self.fileobj.read(1)
+ if c:
+ self.fileobj.prepend(c, True)
+
+ @property
+ def closed(self):
+ return self.fileobj is None
+
def close(self):
if self.fileobj is None:
return
@@ -359,16 +464,6 @@ class GzipFile:
if self.myfileobj:
self.myfileobj.close()
self.myfileobj = None
- self.closed = True
-
- def __del__(self):
- try:
- if (self.myfileobj is None and
- self.fileobj is None):
- return
- except AttributeError:
- return
- self.close()
def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
self._check_closed()
@@ -385,13 +480,6 @@ class GzipFile:
"""
return self.fileobj.fileno()
- def isatty(self):
- return False
-
- def tell(self):
- self._check_closed()
- return self.offset
-
def rewind(self):
'''Return the uncompressed stream file position indicator to the
beginning of the file'''
@@ -401,8 +489,18 @@ class GzipFile:
self._new_member = True
self.extrabuf = b""
self.extrasize = 0
+ self.extrastart = 0
self.offset = 0
+ def readable(self):
+ return self.mode == READ
+
+ def writable(self):
+ return self.mode == WRITE
+
+ def seekable(self):
+ return True
+
def seek(self, offset, whence=0):
if whence:
if whence == 1:
@@ -426,8 +524,18 @@ class GzipFile:
self.read(1024)
self.read(count % 1024)
+ return self.offset
+
def readline(self, size=-1):
if size < 0:
+ # Shortcut common case - newline found in buffer.
+ offset = self.offset - self.extrastart
+ i = self.extrabuf.find(b'\n', offset) + 1
+ if i > 0:
+ self.extrasize -= i - offset
+ self.offset += i - offset
+ return self.extrabuf[offset: i]
+
size = sys.maxsize
readsize = self.min_readsize
else:
@@ -457,41 +565,22 @@ class GzipFile:
self.min_readsize = min(readsize, self.min_readsize * 2, 512)
return b''.join(bufs) # Return resulting line
- def readlines(self, sizehint=0):
- # Negative numbers result in reading all the lines
- if sizehint <= 0:
- sizehint = sys.maxsize
- L = []
- while sizehint > 0:
- line = self.readline()
- if line == b"":
- break
- L.append(line)
- sizehint = sizehint - len(line)
-
- return L
-
- def writelines(self, L):
- for line in L:
- self.write(line)
-
- def __iter__(self):
- return self
- def __next__(self):
- line = self.readline()
- if line:
- return line
- else:
- raise StopIteration
-
- def __enter__(self):
- if self.fileobj is None:
- raise ValueError("I/O operation on closed GzipFile object")
- return self
-
- def __exit__(self, *args):
- self.close()
+def compress(data, compresslevel=9):
+ """Compress data in one shot and return the compressed string.
+ Optional argument is the compression level, in range of 1-9.
+ """
+ buf = io.BytesIO()
+ with GzipFile(fileobj=buf, mode='wb', compresslevel=compresslevel) as f:
+ f.write(data)
+ return buf.getvalue()
+
+def decompress(data):
+ """Decompress a gzip compressed string in one shot.
+ Return the decompressed string.
+ """
+ with GzipFile(fileobj=io.BytesIO(data)) as f:
+ return f.read()
def _test():