1 files changed, 167 insertions, 78 deletions
diff --git a/Lib/gzip.py b/Lib/gzip.py
index 8a2a7184df..ba2149ebf9 100644
--- a/Lib/gzip.py
+++ b/Lib/gzip.py
@@ -5,11 +5,12 @@ but random access is not allowed."""
 
 # based on Andrew Kuchling's minigzip.py distributed with the zlib module
 
-import struct, sys, time
+import struct, sys, time, os
 import zlib
 import builtins
+import io
 
-__all__ = ["GzipFile","open"]
+__all__ = ["GzipFile", "open", "compress", "decompress"]
 
 FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16
 
@@ -44,7 +45,63 @@ def open(filename, mode="rb", compresslevel=9):
     """
     return GzipFile(filename, mode, compresslevel)
 
-class GzipFile:
+class _PaddedFile:
+    """Minimal read-only file object that serves a block of prepended bytes
+    before the contents of an actual file. Shouldn't be used outside of
+    gzip.py, as it lacks essential functionality."""
+
+    def __init__(self, f, prepend=b''):
+        self._buffer = prepend
+        self._length = len(prepend)
+        self.file = f
+        self._read = 0  # index into _buffer; None once the buffer is used up
+
+    def read(self, size):
+        if self._read is None:
+            return self.file.read(size)
+        if self._read + size <= self._length:
+            read = self._read
+            self._read += size
+            return self._buffer[read:self._read]
+        else:  # buffer runs out: finish the request from the real file
+            read = self._read
+            self._read = None
+            return self._buffer[read:] + \
+                   self.file.read(size-self._length+read)
+
+    def prepend(self, prepend=b'', readprevious=False):
+        if self._read is None:
+            self._buffer = prepend
+        elif readprevious and len(prepend) <= self._read:
+            self._read -= len(prepend)  # readprevious: just rewind in place
+            return
+        else:  # keep the unread tail of the buffer, then add the new data
+            self._buffer = self._buffer[self._read:] + prepend
+        self._length = len(self._buffer)
+        self._read = 0
+
+    def unused(self):  # buffered bytes not yet handed out by read()
+        if self._read is None:
+            return b''
+        return self._buffer[self._read:]
+
+    def seek(self, offset, whence=0):
+        # This is only ever called with offset=whence=0
+        if whence == 1 and self._read is not None:
+            if 0 <= offset + self._read <= self._length:
+                self._read += offset
+                return
+            else:
+                offset += self._length - self._read
+        self._read = None
+        self._buffer = None
+        return self.file.seek(offset, whence)
+
+    def __getattr__(self, name):
+        return getattr(self.file, name)  # delegate everything else to file
+
+
+class GzipFile(io.BufferedIOBase):
     """The GzipFile class simulates most of the methods of a file object with
     the exception of the readinto() and truncate() methods.
 
@@ -109,11 +166,16 @@ class GzipFile:
             self.mode = READ
             # Set flag indicating start of a new member
             self._new_member = True
+            # Buffer data read from gzip file. extrastart is offset in
+            # stream where buffer starts. extrasize is number of
+            # bytes remaining in buffer from current stream position.
             self.extrabuf = b""
             self.extrasize = 0
+            self.extrastart = 0
             self.name = filename
             # Starts small, scales exponentially
             self.min_readsize = 100
+            fileobj = _PaddedFile(fileobj)
 
         elif mode[0:1] == 'w' or mode[0:1] == 'a':
             self.mode = WRITE
@@ -129,7 +191,6 @@ class GzipFile:
         self.fileobj = fileobj
         self.offset = 0
         self.mtime = mtime
-        self.closed = False
 
         if self.mode == WRITE:
             self._write_gzip_header()
@@ -143,7 +204,10 @@ class GzipFile:
         return self.name
 
     def __repr__(self):
-        s = repr(self.fileobj)
+        fileobj = self.fileobj
+        if isinstance(fileobj, _PaddedFile):
+            fileobj = fileobj.file
+        s = repr(fileobj)
         return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'
 
     def _check_closed(self):
@@ -166,7 +230,8 @@ class GzipFile:
         try:
             # RFC 1952 requires the FNAME field to be Latin-1. Do not
             # include filenames that cannot be represented that way.
-            fname = self.name.encode('latin-1')
+            fname = os.path.basename(self.name)
+            fname = fname.encode('latin-1')
             if fname.endswith(b'.gz'):
                 fname = fname[:-3]
         except UnicodeEncodeError:
@@ -190,6 +255,9 @@ class GzipFile:
 
     def _read_gzip_header(self):
         magic = self.fileobj.read(2)
+        if magic == b'':
+            raise EOFError("Reached EOF")
+
         if magic != b'\037\213':
             raise IOError('Not a gzipped file')
         method = ord( self.fileobj.read(1) )
@@ -221,6 +289,10 @@ class GzipFile:
         if flag & FHCRC:
             self.fileobj.read(2)     # Read & discard the 16-bit header CRC
 
+        unused = self.fileobj.unused()
+        if unused:
+            uncompress = self.decompress.decompress(unused)
+            self._add_read_data(uncompress)
 
     def write(self,data):
         self._check_closed()
@@ -230,12 +302,19 @@ class GzipFile:
 
         if self.fileobj is None:
             raise ValueError("write() on closed GzipFile object")
+
+        # Convert data type if called by io.BufferedWriter.
+        if isinstance(data, memoryview):
+            data = data.tobytes()
+
         if len(data) > 0:
             self.size = self.size + len(data)
             self.crc = zlib.crc32(data, self.crc) & 0xffffffff
             self.fileobj.write( self.compress.compress(data) )
             self.offset += len(data)
 
+        return len(data)
+
     def read(self, size=-1):
         self._check_closed()
         if self.mode != READ:
@@ -262,15 +341,36 @@ class GzipFile:
                 if size > self.extrasize:
                     size = self.extrasize
 
-        chunk = self.extrabuf[:size]
-        self.extrabuf = self.extrabuf[size:]
+        offset = self.offset - self.extrastart
+        chunk = self.extrabuf[offset: offset + size]
         self.extrasize = self.extrasize - size
 
         self.offset += size
         return chunk
 
+    def peek(self, n):
+        if self.mode != READ:
+            import errno
+            raise IOError(errno.EBADF, "peek() on write-only GzipFile object")
+
+        # Never hand back a ridiculously small buffer: one common idiom is
+        # to call peek(1) and expect more bytes in return.
+        if n < 100:
+            n = 100
+        if self.extrasize == 0:  # nothing buffered: try to read more
+            if self.fileobj is None:
+                return b''
+            try:
+                # 1024 is the same buffering heuristic used in read()
+                self._read(max(n, 1024))
+            except EOFError:
+                pass  # end of data: fall through with whatever is buffered
+        offset = self.offset - self.extrastart  # stream position in extrabuf
+        remaining = self.extrasize
+        assert remaining == len(self.extrabuf) - offset
+        return self.extrabuf[offset:offset + n]  # self.offset is not moved
+
     def _unread(self, buf):
-        self.extrabuf = buf + self.extrabuf
         self.extrasize = len(buf) + self.extrasize
         self.offset -= len(buf)
 
@@ -281,16 +381,6 @@ class GzipFile:
         if self._new_member:
             # If the _new_member flag is set, we have to
             # jump to the next member, if there is one.
-            #
-            # First, check if we're at the end of the file;
-            # if so, it's time to stop; no more members to read.
-            pos = self.fileobj.tell()   # Save current position
-            self.fileobj.seek(0, 2)     # Seek to end of file
-            if pos == self.fileobj.tell():
-                raise EOFError("Reached EOF")
-            else:
-                self.fileobj.seek( pos ) # Return to original position
-
             self._init_read()
             self._read_gzip_header()
             self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
@@ -304,6 +394,9 @@ class GzipFile:
 
         if buf == b"":
             uncompress = self.decompress.flush()
+            # Prepend the already read bytes to the fileobj so they can be
+            # seen by _read_eof()
+            self.fileobj.prepend(self.decompress.unused_data, True)
             self._read_eof()
             self._add_read_data( uncompress )
             raise EOFError('Reached EOF')
@@ -315,10 +408,9 @@ class GzipFile:
             # Ending case: we've come to the end of a member in the file,
             # so seek back to the start of the unused data, finish up
             # this member, and read a new gzip header.
-            # (The number of bytes to seek back is the length of the unused
-            # data, minus 8 because _read_eof() will rewind a further 8 bytes)
-            self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)
-
+            # Prepend the already read bytes to the fileobj so they can be
+            # seen by _read_eof() and _read_gzip_header()
+            self.fileobj.prepend(self.decompress.unused_data, True)
             # Check the CRC and file size, and set the flag so we read
             # a new member on the next call
             self._read_eof()
@@ -326,17 +418,17 @@ class GzipFile:
 
     def _add_read_data(self, data):
         self.crc = zlib.crc32(data, self.crc) & 0xffffffff
-        self.extrabuf = self.extrabuf + data
+        offset = self.offset - self.extrastart
+        self.extrabuf = self.extrabuf[offset:] + data
         self.extrasize = self.extrasize + len(data)
+        self.extrastart = self.offset
         self.size = self.size + len(data)
 
     def _read_eof(self):
-        # We've read to the end of the file, so we have to rewind in order
-        # to reread the 8 bytes containing the CRC and the file size.
+        # We've read to the end of the file
         # We check the that the computed CRC and size of the
         # uncompressed data matches the stored values.  Note that the size
         # stored is the true file size mod 2**32.
-        self.fileobj.seek(-8, 1)
         crc32 = read32(self.fileobj)
         isize = read32(self.fileobj)  # may exceed 2GB
         if crc32 != self.crc:
@@ -345,6 +437,19 @@ class GzipFile:
         elif isize != (self.size & 0xffffffff):
             raise IOError("Incorrect length of data produced")
 
+        # A gzip file may be padded with zero bytes and still be followed by
+        # further data. Consume all zero bytes and set the file position to
+        # the first non-zero byte. See http://www.gzip.org/#faq8
+        c = b"\x00"
+        while c == b"\x00":
+            c = self.fileobj.read(1)
+        if c:
+            self.fileobj.prepend(c, True)
+
+    @property
+    def closed(self):
+        return self.fileobj is None  # derived from fileobj, not a stored flag
+
     def close(self):
         if self.fileobj is None:
             return
@@ -359,16 +464,6 @@ class GzipFile:
         if self.myfileobj:
             self.myfileobj.close()
             self.myfileobj = None
-        self.closed = True
-
-    def __del__(self):
-        try:
-            if (self.myfileobj is None and
-                self.fileobj is None):
-                return
-        except AttributeError:
-            return
-        self.close()
 
     def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
         self._check_closed()
@@ -385,13 +480,6 @@ class GzipFile:
         """
         return self.fileobj.fileno()
 
-    def isatty(self):
-        return False
-
-    def tell(self):
-        self._check_closed()
-        return self.offset
-
     def rewind(self):
         '''Return the uncompressed stream file position indicator to the
         beginning of the file'''
@@ -401,8 +489,18 @@ class GzipFile:
         self._new_member = True
         self.extrabuf = b""
         self.extrasize = 0
+        self.extrastart = 0
         self.offset = 0
 
+    def readable(self):
+        return self.mode == READ  # True only for a file in READ mode
+
+    def writable(self):
+        return self.mode == WRITE  # WRITE is set for both 'w' and 'a' modes
+
+    def seekable(self):
+        return True  # NOTE(review): unconditional, even in WRITE mode; confirm
+
     def seek(self, offset, whence=0):
         if whence:
             if whence == 1:
@@ -426,8 +524,18 @@ class GzipFile:
                 self.read(1024)
             self.read(count % 1024)
 
+        return self.offset
+
     def readline(self, size=-1):
         if size < 0:
+            # Shortcut common case - newline found in buffer.
+            offset = self.offset - self.extrastart
+            i = self.extrabuf.find(b'\n', offset) + 1
+            if i > 0:
+                self.extrasize -= i - offset
+                self.offset += i - offset
+                return self.extrabuf[offset: i]
+
             size = sys.maxsize
             readsize = self.min_readsize
         else:
@@ -457,41 +565,22 @@ class GzipFile:
             self.min_readsize = min(readsize, self.min_readsize * 2, 512)
         return b''.join(bufs) # Return resulting line
 
-    def readlines(self, sizehint=0):
-        # Negative numbers result in reading all the lines
-        if sizehint <= 0:
-            sizehint = sys.maxsize
-        L = []
-        while sizehint > 0:
-            line = self.readline()
-            if line == b"":
-                break
-            L.append(line)
-            sizehint = sizehint - len(line)
-
-        return L
-
-    def writelines(self, L):
-        for line in L:
-            self.write(line)
-
-    def __iter__(self):
-        return self
 
-    def __next__(self):
-        line = self.readline()
-        if line:
-            return line
-        else:
-            raise StopIteration
-
-    def __enter__(self):
-        if self.fileobj is None:
-            raise ValueError("I/O operation on closed GzipFile object")
-        return self
-
-    def __exit__(self, *args):
-        self.close()
+def compress(data, compresslevel=9):
+    """Compress data in one shot and return the compressed bytes.
+    Optional argument is the compression level, in range of 1-9.
+    """
+    buf = io.BytesIO()
+    with GzipFile(fileobj=buf, mode='wb', compresslevel=compresslevel) as f:
+        f.write(data)
+    return buf.getvalue()  # taken only after the with block has exited
+
+def decompress(data):
+    """Decompress gzip-compressed bytes in one shot.
+    Return the decompressed bytes.
+    """
+    with GzipFile(fileobj=io.BytesIO(data)) as f:
+        return f.read()
 
 
 def _test():