diff options
Diffstat (limited to 'Lib/gzip.py')
-rw-r--r-- | Lib/gzip.py | 245 |
1 files changed, 167 insertions, 78 deletions
diff --git a/Lib/gzip.py b/Lib/gzip.py index 8a2a7184df..ba2149ebf9 100644 --- a/Lib/gzip.py +++ b/Lib/gzip.py @@ -5,11 +5,12 @@ but random access is not allowed.""" # based on Andrew Kuchling's minigzip.py distributed with the zlib module -import struct, sys, time +import struct, sys, time, os import zlib import builtins +import io -__all__ = ["GzipFile","open"] +__all__ = ["GzipFile", "open", "compress", "decompress"] FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16 @@ -44,7 +45,63 @@ def open(filename, mode="rb", compresslevel=9): """ return GzipFile(filename, mode, compresslevel) -class GzipFile: +class _PaddedFile: + """Minimal read-only file object that prepends a string to the contents + of an actual file. Shouldn't be used outside of gzip.py, as it lacks + essential functionality.""" + + def __init__(self, f, prepend=b''): + self._buffer = prepend + self._length = len(prepend) + self.file = f + self._read = 0 + + def read(self, size): + if self._read is None: + return self.file.read(size) + if self._read + size <= self._length: + read = self._read + self._read += size + return self._buffer[read:self._read] + else: + read = self._read + self._read = None + return self._buffer[read:] + \ + self.file.read(size-self._length+read) + + def prepend(self, prepend=b'', readprevious=False): + if self._read is None: + self._buffer = prepend + elif readprevious and len(prepend) <= self._read: + self._read -= len(prepend) + return + else: + self._buffer = self._buffer[self._read:] + prepend + self._length = len(self._buffer) + self._read = 0 + + def unused(self): + if self._read is None: + return b'' + return self._buffer[self._read:] + + def seek(self, offset, whence=0): + # This is only ever called with offset=whence=0 + if whence == 1 and self._read is not None: + if 0 <= offset + self._read <= self._length: + self._read += offset + return + else: + offset += self._length - self._read + self._read = None + self._buffer = None + return self.file.seek(offset, whence) + 
+ def __getattr__(self, name): + return getattr(self.file, name) + + +class GzipFile(io.BufferedIOBase): """The GzipFile class simulates most of the methods of a file object with the exception of the readinto() and truncate() methods. @@ -109,11 +166,16 @@ class GzipFile: self.mode = READ # Set flag indicating start of a new member self._new_member = True + # Buffer data read from gzip file. extrastart is offset in + # stream where buffer starts. extrasize is number of + # bytes remaining in buffer from current stream position. self.extrabuf = b"" self.extrasize = 0 + self.extrastart = 0 self.name = filename # Starts small, scales exponentially self.min_readsize = 100 + fileobj = _PaddedFile(fileobj) elif mode[0:1] == 'w' or mode[0:1] == 'a': self.mode = WRITE @@ -129,7 +191,6 @@ class GzipFile: self.fileobj = fileobj self.offset = 0 self.mtime = mtime - self.closed = False if self.mode == WRITE: self._write_gzip_header() @@ -143,7 +204,10 @@ class GzipFile: return self.name def __repr__(self): - s = repr(self.fileobj) + fileobj = self.fileobj + if isinstance(fileobj, _PaddedFile): + fileobj = fileobj.file + s = repr(fileobj) return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>' def _check_closed(self): @@ -166,7 +230,8 @@ class GzipFile: try: # RFC 1952 requires the FNAME field to be Latin-1. Do not # include filenames that cannot be represented that way. 
- fname = self.name.encode('latin-1') + fname = os.path.basename(self.name) + fname = fname.encode('latin-1') if fname.endswith(b'.gz'): fname = fname[:-3] except UnicodeEncodeError: @@ -190,6 +255,9 @@ class GzipFile: def _read_gzip_header(self): magic = self.fileobj.read(2) + if magic == b'': + raise EOFError("Reached EOF") + if magic != b'\037\213': raise IOError('Not a gzipped file') method = ord( self.fileobj.read(1) ) @@ -221,6 +289,10 @@ class GzipFile: if flag & FHCRC: self.fileobj.read(2) # Read & discard the 16-bit header CRC + unused = self.fileobj.unused() + if unused: + uncompress = self.decompress.decompress(unused) + self._add_read_data(uncompress) def write(self,data): self._check_closed() @@ -230,12 +302,19 @@ class GzipFile: if self.fileobj is None: raise ValueError("write() on closed GzipFile object") + + # Convert data type if called by io.BufferedWriter. + if isinstance(data, memoryview): + data = data.tobytes() + if len(data) > 0: self.size = self.size + len(data) self.crc = zlib.crc32(data, self.crc) & 0xffffffff self.fileobj.write( self.compress.compress(data) ) self.offset += len(data) + return len(data) + def read(self, size=-1): self._check_closed() if self.mode != READ: @@ -262,15 +341,36 @@ class GzipFile: if size > self.extrasize: size = self.extrasize - chunk = self.extrabuf[:size] - self.extrabuf = self.extrabuf[size:] + offset = self.offset - self.extrastart + chunk = self.extrabuf[offset: offset + size] self.extrasize = self.extrasize - size self.offset += size return chunk + def peek(self, n): + if self.mode != READ: + import errno + raise IOError(errno.EBADF, "peek() on write-only GzipFile object") + + # Do not return ridiculously small buffers, for one common idiom + # is to call peek(1) and expect more bytes in return. 
+ if n < 100: + n = 100 + if self.extrasize == 0: + if self.fileobj is None: + return b'' + try: + # 1024 is the same buffering heuristic used in read() + self._read(max(n, 1024)) + except EOFError: + pass + offset = self.offset - self.extrastart + remaining = self.extrasize + assert remaining == len(self.extrabuf) - offset + return self.extrabuf[offset:offset + n] + def _unread(self, buf): - self.extrabuf = buf + self.extrabuf self.extrasize = len(buf) + self.extrasize self.offset -= len(buf) @@ -281,16 +381,6 @@ class GzipFile: if self._new_member: # If the _new_member flag is set, we have to # jump to the next member, if there is one. - # - # First, check if we're at the end of the file; - # if so, it's time to stop; no more members to read. - pos = self.fileobj.tell() # Save current position - self.fileobj.seek(0, 2) # Seek to end of file - if pos == self.fileobj.tell(): - raise EOFError("Reached EOF") - else: - self.fileobj.seek( pos ) # Return to original position - self._init_read() self._read_gzip_header() self.decompress = zlib.decompressobj(-zlib.MAX_WBITS) @@ -304,6 +394,9 @@ class GzipFile: if buf == b"": uncompress = self.decompress.flush() + # Prepend the already read bytes to the fileobj so they can be + # seen by _read_eof() + self.fileobj.prepend(self.decompress.unused_data, True) self._read_eof() self._add_read_data( uncompress ) raise EOFError('Reached EOF') @@ -315,10 +408,9 @@ class GzipFile: # Ending case: we've come to the end of a member in the file, # so seek back to the start of the unused data, finish up # this member, and read a new gzip header. 
- # (The number of bytes to seek back is the length of the unused - # data, minus 8 because _read_eof() will rewind a further 8 bytes) - self.fileobj.seek( -len(self.decompress.unused_data)+8, 1) - + # Prepend the already read bytes to the fileobj so they can be + # seen by _read_eof() and _read_gzip_header() + self.fileobj.prepend(self.decompress.unused_data, True) # Check the CRC and file size, and set the flag so we read # a new member on the next call self._read_eof() @@ -326,17 +418,17 @@ class GzipFile: def _add_read_data(self, data): self.crc = zlib.crc32(data, self.crc) & 0xffffffff - self.extrabuf = self.extrabuf + data + offset = self.offset - self.extrastart + self.extrabuf = self.extrabuf[offset:] + data self.extrasize = self.extrasize + len(data) + self.extrastart = self.offset self.size = self.size + len(data) def _read_eof(self): - # We've read to the end of the file, so we have to rewind in order - # to reread the 8 bytes containing the CRC and the file size. + # We've read to the end of the file. # We check the that the computed CRC and size of the # uncompressed data matches the stored values. Note that the size # stored is the true file size mod 2**32. - self.fileobj.seek(-8, 1) crc32 = read32(self.fileobj) isize = read32(self.fileobj) # may exceed 2GB if crc32 != self.crc: @@ -345,6 +437,19 @@ class GzipFile: elif isize != (self.size & 0xffffffff): raise IOError("Incorrect length of data produced") + # Gzip files can be padded with zeroes and still have archives. + # Consume all zero bytes and set the file position to the first + # non-zero byte. 
See http://www.gzip.org/#faq8 + c = b"\x00" + while c == b"\x00": + c = self.fileobj.read(1) + if c: + self.fileobj.prepend(c, True) + + @property + def closed(self): + return self.fileobj is None + def close(self): if self.fileobj is None: return @@ -359,16 +464,6 @@ class GzipFile: if self.myfileobj: self.myfileobj.close() self.myfileobj = None - self.closed = True - - def __del__(self): - try: - if (self.myfileobj is None and - self.fileobj is None): - return - except AttributeError: - return - self.close() def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH): self._check_closed() @@ -385,13 +480,6 @@ class GzipFile: """ return self.fileobj.fileno() - def isatty(self): - return False - - def tell(self): - self._check_closed() - return self.offset - def rewind(self): '''Return the uncompressed stream file position indicator to the beginning of the file''' @@ -401,8 +489,18 @@ class GzipFile: self._new_member = True self.extrabuf = b"" self.extrasize = 0 + self.extrastart = 0 self.offset = 0 + def readable(self): + return self.mode == READ + + def writable(self): + return self.mode == WRITE + + def seekable(self): + return True + def seek(self, offset, whence=0): if whence: if whence == 1: @@ -426,8 +524,18 @@ class GzipFile: self.read(1024) self.read(count % 1024) + return self.offset + def readline(self, size=-1): if size < 0: + # Shortcut common case - newline found in buffer. 
+ offset = self.offset - self.extrastart + i = self.extrabuf.find(b'\n', offset) + 1 + if i > 0: + self.extrasize -= i - offset + self.offset += i - offset + return self.extrabuf[offset: i] + size = sys.maxsize readsize = self.min_readsize else: @@ -457,41 +565,22 @@ class GzipFile: self.min_readsize = min(readsize, self.min_readsize * 2, 512) return b''.join(bufs) # Return resulting line - def readlines(self, sizehint=0): - # Negative numbers result in reading all the lines - if sizehint <= 0: - sizehint = sys.maxsize - L = [] - while sizehint > 0: - line = self.readline() - if line == b"": - break - L.append(line) - sizehint = sizehint - len(line) - - return L - - def writelines(self, L): - for line in L: - self.write(line) - - def __iter__(self): - return self - def __next__(self): - line = self.readline() - if line: - return line - else: - raise StopIteration - - def __enter__(self): - if self.fileobj is None: - raise ValueError("I/O operation on closed GzipFile object") - return self - - def __exit__(self, *args): - self.close() +def compress(data, compresslevel=9): + """Compress data in one shot and return the compressed string. + Optional argument is the compression level, in range of 1-9. + """ + buf = io.BytesIO() + with GzipFile(fileobj=buf, mode='wb', compresslevel=compresslevel) as f: + f.write(data) + return buf.getvalue() + +def decompress(data): + """Decompress a gzip compressed string in one shot. + Return the decompressed string. + """ + with GzipFile(fileobj=io.BytesIO(data)) as f: + return f.read() def _test(): |