1 files changed, 303 insertions, 114 deletions
diff --git a/Lib/zipfile.py b/Lib/zipfile.py
index b223b4a4c1..ff64c90829 100644
--- a/Lib/zipfile.py
+++ b/Lib/zipfile.py
@@ -22,7 +22,18 @@ except ImportError:
     zlib = None
     crc32 = binascii.crc32
 
-__all__ = ["BadZipFile", "BadZipfile", "error", "ZIP_STORED", "ZIP_DEFLATED",
+try:
+    import bz2 # We may need its compression method
+except ImportError:
+    bz2 = None
+
+try:
+    import lzma # We may need its compression method
+except ImportError:
+    lzma = None
+
+__all__ = ["BadZipFile", "BadZipfile", "error",
+           "ZIP_STORED", "ZIP_DEFLATED", "ZIP_BZIP2", "ZIP_LZMA",
            "is_zipfile", "ZipInfo", "ZipFile", "PyZipFile", "LargeZipFile"]
 
 class BadZipFile(Exception):
@@ -45,8 +56,17 @@ ZIP_MAX_COMMENT = (1 << 16) - 1
 # constants for Zip file compression methods
 ZIP_STORED = 0
 ZIP_DEFLATED = 8
+ZIP_BZIP2 = 12
+ZIP_LZMA = 14
 # Other ZIP compression methods not supported
 
+DEFAULT_VERSION = 20
+ZIP64_VERSION = 45
+BZIP2_VERSION = 46
+LZMA_VERSION = 63
+# we recognize (but not necessarily support) all features up to that version
+MAX_EXTRACT_VERSION = 63
+
 # Below are some formats and associated data for reading/writing headers using
 # the struct module.  The names and structures of headers/records are those used
 # in the PKWARE description of the ZIP file format:
@@ -322,8 +342,8 @@ class ZipInfo (object):
         else:
             # Assume everything else is unix-y
             self.create_system = 3          # System which created ZIP archive
-        self.create_version = 20        # Version which created ZIP archive
-        self.extract_version = 20       # Version needed to extract archive
+        self.create_version = DEFAULT_VERSION  # Version which created ZIP archive
+        self.extract_version = DEFAULT_VERSION # Version needed to extract archive
         self.reserved = 0               # Must be zero
         self.flag_bits = 0              # ZIP flag bits
         self.volume = 0                 # Volume number of file header
@@ -350,6 +370,7 @@ class ZipInfo (object):
 
         extra = self.extra
 
+        min_version = 0
         if zip64 is None:
             zip64 = file_size > ZIP64_LIMIT or compress_size > ZIP64_LIMIT
         if zip64:
@@ -363,9 +384,15 @@ class ZipInfo (object):
             # fall back to the ZIP64 extension
             file_size = 0xffffffff
             compress_size = 0xffffffff
-            self.extract_version = max(45, self.extract_version)
-            self.create_version = max(45, self.extract_version)
+            min_version = ZIP64_VERSION
+
+        if self.compress_type == ZIP_BZIP2:
+            min_version = max(BZIP2_VERSION, min_version)
+        elif self.compress_type == ZIP_LZMA:
+            min_version = max(LZMA_VERSION, min_version)
 
+        self.extract_version = max(min_version, self.extract_version)
+        self.create_version = max(min_version, self.create_version)
         filename, flag_bits = self._encodeFilenameFlags()
         header = struct.pack(structFileHeader, stringFileHeader,
                  self.extract_version, self.reserved, flag_bits,
@@ -476,6 +503,57 @@ class _ZipDecrypter:
         return c
 
 
+class LZMACompressor:
+
+    def __init__(self):
+        self._comp = None
+
+    def _init(self):
+        props = lzma._encode_filter_properties({'id': lzma.FILTER_LZMA1})
+        self._comp = lzma.LZMACompressor(lzma.FORMAT_RAW, filters=[
+                lzma._decode_filter_properties(lzma.FILTER_LZMA1, props)
+        ])
+        return struct.pack('<BBH', 9, 4, len(props)) + props
+
+    def compress(self, data):
+        if self._comp is None:
+            return self._init() + self._comp.compress(data)
+        return self._comp.compress(data)
+
+    def flush(self):
+        if self._comp is None:
+            return self._init() + self._comp.flush()
+        return self._comp.flush()
+
+
+class LZMADecompressor:
+
+    def __init__(self):
+        self._decomp = None
+        self._unconsumed = b''
+        self.eof = False
+
+    def decompress(self, data):
+        if self._decomp is None:
+            self._unconsumed += data
+            if len(self._unconsumed) <= 4:
+                return b''
+            psize, = struct.unpack('<H', self._unconsumed[2:4])
+            if len(self._unconsumed) <= 4 + psize:
+                return b''
+
+            self._decomp = lzma.LZMADecompressor(lzma.FORMAT_RAW, filters=[
+                    lzma._decode_filter_properties(lzma.FILTER_LZMA1,
+                            self._unconsumed[4:4 + psize])
+            ])
+            data = self._unconsumed[4 + psize:]
+            del self._unconsumed
+
+        result = self._decomp.decompress(data)
+        self.eof = self._decomp.eof
+        return result
+
+
 compressor_names = {
     0: 'store',
     1: 'shrink',
@@ -496,6 +574,53 @@ compressor_names = {
     98: 'ppmd',
 }
 
+def _check_compression(compression):
+    if compression == ZIP_STORED:
+        pass
+    elif compression == ZIP_DEFLATED:
+        if not zlib:
+            raise RuntimeError(
+                    "Compression requires the (missing) zlib module")
+    elif compression == ZIP_BZIP2:
+        if not bz2:
+            raise RuntimeError(
+                    "Compression requires the (missing) bz2 module")
+    elif compression == ZIP_LZMA:
+        if not lzma:
+            raise RuntimeError(
+                    "Compression requires the (missing) lzma module")
+    else:
+        raise RuntimeError("That compression method is not supported")
+
+
+def _get_compressor(compress_type):
+    if compress_type == ZIP_DEFLATED:
+        return zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION,
+             zlib.DEFLATED, -15)
+    elif compress_type == ZIP_BZIP2:
+        return bz2.BZ2Compressor()
+    elif compress_type == ZIP_LZMA:
+        return LZMACompressor()
+    else:
+        return None
+
+
+def _get_decompressor(compress_type):
+    if compress_type == ZIP_STORED:
+        return None
+    elif compress_type == ZIP_DEFLATED:
+        return zlib.decompressobj(-15)
+    elif compress_type == ZIP_BZIP2:
+        return bz2.BZ2Decompressor()
+    elif compress_type == ZIP_LZMA:
+        return LZMADecompressor()
+    else:
+        descr = compressor_names.get(compress_type)
+        if descr:
+            raise NotImplementedError("compression type %d (%s)" % (compress_type, descr))
+        else:
+            raise NotImplementedError("compression type %d" % (compress_type,))
+
 
 class ZipExtFile(io.BufferedIOBase):
     """File-like object for reading an archive member.
@@ -518,19 +643,12 @@ class ZipExtFile(io.BufferedIOBase):
         self._close_fileobj = close_fileobj
 
         self._compress_type = zipinfo.compress_type
-        self._compress_size = zipinfo.compress_size
         self._compress_left = zipinfo.compress_size
+        self._left = zipinfo.file_size
 
-        if self._compress_type == ZIP_DEFLATED:
-            self._decompressor = zlib.decompressobj(-15)
-        elif self._compress_type != ZIP_STORED:
-            descr = compressor_names.get(self._compress_type)
-            if descr:
-                raise NotImplementedError("compression type %d (%s)" % (self._compress_type, descr))
-            else:
-                raise NotImplementedError("compression type %d" % (self._compress_type,))
-        self._unconsumed = b''
+        self._decompressor = _get_decompressor(self._compress_type)
 
+        self._eof = False
         self._readbuffer = b''
         self._offset = 0
 
@@ -605,7 +723,11 @@ class ZipExtFile(io.BufferedIOBase):
         """Returns buffered bytes without advancing the position."""
         if n > len(self._readbuffer) - self._offset:
             chunk = self.read(n)
-            self._offset -= len(chunk)
+            if len(chunk) > self._offset:
+                self._readbuffer = chunk + self._readbuffer[self._offset:]
+                self._offset = 0
+            else:
+                self._offset -= len(chunk)
 
         # Return up to 512 bytes to reduce allocation overhead for tight loops.
         return self._readbuffer[self._offset: self._offset + 512]
@@ -617,80 +739,132 @@ class ZipExtFile(io.BufferedIOBase):
         """Read and return up to n bytes.
         If the argument is omitted, None, or negative, data is read and returned until EOF is reached..
         """
-        buf = b''
-        if n is None:
-            n = -1
-        while True:
-            if n < 0:
-                data = self.read1(n)
-            elif n > len(buf):
-                data = self.read1(n - len(buf))
-            else:
-                return buf
-            if len(data) == 0:
-                return buf
+        if n is None or n < 0:
+            buf = self._readbuffer[self._offset:]
+            self._readbuffer = b''
+            self._offset = 0
+            while not self._eof:
+                buf += self._read1(self.MAX_N)
+            return buf
+
+        end = n + self._offset
+        if end < len(self._readbuffer):
+            buf = self._readbuffer[self._offset:end]
+            self._offset = end
+            return buf
+
+        n = end - len(self._readbuffer)
+        buf = self._readbuffer[self._offset:]
+        self._readbuffer = b''
+        self._offset = 0
+        while n > 0 and not self._eof:
+            data = self._read1(n)
+            if n < len(data):
+                self._readbuffer = data
+                self._offset = n
+                buf += data[:n]
+                break
             buf += data
+            n -= len(data)
+        return buf
 
-    def _update_crc(self, newdata, eof):
+    def _update_crc(self, newdata):
         # Update the CRC using the given data.
         if self._expected_crc is None:
             # No need to compute the CRC if we don't have a reference value
             return
         self._running_crc = crc32(newdata, self._running_crc) & 0xffffffff
         # Check the CRC if we're at the end of the file
-        if eof and self._running_crc != self._expected_crc:
+        if self._eof and self._running_crc != self._expected_crc:
             raise BadZipFile("Bad CRC-32 for file %r" % self.name)
 
     def read1(self, n):
         """Read up to n bytes with at most one read() system call."""
 
-        # Simplify algorithm (branching) by transforming negative n to large n.
-        if n < 0 or n is None:
-            n = self.MAX_N
-
-        # Bytes available in read buffer.
-        len_readbuffer = len(self._readbuffer) - self._offset
+        if n is None or n < 0:
+            buf = self._readbuffer[self._offset:]
+            self._readbuffer = b''
+            self._offset = 0
+            while not self._eof:
+                data = self._read1(self.MAX_N)
+                if data:
+                    buf += data
+                    break
+            return buf
 
-        # Read from file.
-        if self._compress_left > 0 and n > len_readbuffer + len(self._unconsumed):
-            nbytes = n - len_readbuffer - len(self._unconsumed)
-            nbytes = max(nbytes, self.MIN_READ_SIZE)
-            nbytes = min(nbytes, self._compress_left)
+        end = n + self._offset
+        if end < len(self._readbuffer):
+            buf = self._readbuffer[self._offset:end]
+            self._offset = end
+            return buf
 
-            data = self._fileobj.read(nbytes)
-            self._compress_left -= len(data)
+        n = end - len(self._readbuffer)
+        buf = self._readbuffer[self._offset:]
+        self._readbuffer = b''
+        self._offset = 0
+        if n > 0:
+            while not self._eof:
+                data = self._read1(n)
+                if n < len(data):
+                    self._readbuffer = data
+                    self._offset = n
+                    buf += data[:n]
+                    break
+                if data:
+                    buf += data
+                    break
+        return buf
 
-            if data and self._decrypter is not None:
-                data = bytes(map(self._decrypter, data))
+    def _read1(self, n):
+        # Read up to n compressed bytes with at most one read() system call,
+        # decrypt and decompress them.
+        if self._eof or n <= 0:
+            return b''
 
-            if self._compress_type == ZIP_STORED:
-                self._update_crc(data, eof=(self._compress_left==0))
-                self._readbuffer = self._readbuffer[self._offset:] + data
-                self._offset = 0
-            else:
-                # Prepare deflated bytes for decompression.
-                self._unconsumed += data
-
-        # Handle unconsumed data.
-        if (len(self._unconsumed) > 0 and n > len_readbuffer and
-            self._compress_type == ZIP_DEFLATED):
-            data = self._decompressor.decompress(
-                self._unconsumed,
-                max(n - len_readbuffer, self.MIN_READ_SIZE)
-            )
-
-            self._unconsumed = self._decompressor.unconsumed_tail
-            eof = len(self._unconsumed) == 0 and self._compress_left == 0
-            if eof:
+        # Read from file.
+        if self._compress_type == ZIP_DEFLATED:
+            ## Handle unconsumed data.
+            data = self._decompressor.unconsumed_tail
+            if n > len(data):
+                data += self._read2(n - len(data))
+        else:
+            data = self._read2(n)
+
+        if self._compress_type == ZIP_STORED:
+            self._eof = self._compress_left <= 0
+        elif self._compress_type == ZIP_DEFLATED:
+            n = max(n, self.MIN_READ_SIZE)
+            data = self._decompressor.decompress(data, n)
+            self._eof = (self._decompressor.eof or
+                    self._compress_left <= 0 and
+                    not self._decompressor.unconsumed_tail)
+            if self._eof:
                 data += self._decompressor.flush()
+        else:
+            data = self._decompressor.decompress(data)
+            self._eof = self._decompressor.eof or self._compress_left <= 0
+
+        data = data[:self._left]
+        self._left -= len(data)
+        if self._left <= 0:
+            self._eof = True
+        self._update_crc(data)
+        return data
 
-            self._update_crc(data, eof=eof)
-            self._readbuffer = self._readbuffer[self._offset:] + data
-            self._offset = 0
+    def _read2(self, n):
+        if self._compress_left <= 0:
+            return b''
+
+        n = max(n, self.MIN_READ_SIZE)
+        n = min(n, self._compress_left)
+
+        data = self._fileobj.read(n)
+        self._compress_left -= len(data)
+        if not data:
+            raise EOFError
 
-        # Read from buffer.
-        data = self._readbuffer[self._offset: self._offset + n]
-        self._offset += len(data)
+        if self._decrypter is not None:
+            data = bytes(map(self._decrypter, data))
         return data
 
     def close(self):
@@ -709,7 +883,8 @@ class ZipFile:
     file: Either the path to the file, or a file-like object.
           If it is a path, the file will be opened and closed by ZipFile.
     mode: The mode can be either read "r", write "w" or append "a".
-    compression: ZIP_STORED (no compression) or ZIP_DEFLATED (requires zlib).
+    compression: ZIP_STORED (no compression), ZIP_DEFLATED (requires zlib),
+                 ZIP_BZIP2 (requires bz2) or ZIP_LZMA (requires lzma).
     allowZip64: if True ZipFile will create files with ZIP64 extensions when
                 needed, otherwise it will raise an exception when this would
                 be necessary.
@@ -717,20 +892,14 @@ class ZipFile:
     """
 
     fp = None                   # Set here since __del__ checks it
+    _windows_illegal_name_trans_table = None
 
     def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=False):
         """Open the ZIP file with mode read "r", write "w" or append "a"."""
         if mode not in ("r", "w", "a"):
             raise RuntimeError('ZipFile() requires mode "r", "w", or "a"')
 
-        if compression == ZIP_STORED:
-            pass
-        elif compression == ZIP_DEFLATED:
-            if not zlib:
-                raise RuntimeError(
-                      "Compression requires the (missing) zlib module")
-        else:
-            raise RuntimeError("That compression method is not supported")
+        _check_compression(compression)
 
         self._allowZip64 = allowZip64
         self._didModify = False
@@ -851,6 +1020,9 @@ class ZipFile:
             (x.create_version, x.create_system, x.extract_version, x.reserved,
                 x.flag_bits, x.compress_type, t, d,
                 x.CRC, x.compress_size, x.file_size) = centdir[1:12]
+            if x.extract_version > MAX_EXTRACT_VERSION:
+                raise NotImplementedError("zip file version %.1f" %
+                                          (x.extract_version / 10))
             x.volume, x.internal_attr, x.external_attr = centdir[15:18]
             # Convert date/time code to (year, month, day, hour, min, sec)
             x._raw_time = t
@@ -873,10 +1045,7 @@ class ZipFile:
 
     def namelist(self):
         """Return a list of file names in the archive."""
-        l = []
-        for data in self.filelist:
-            l.append(data.filename)
-        return l
+        return [data.filename for data in self.filelist]
 
     def infolist(self):
         """Return a list of class ZipInfo instances for files in the
@@ -933,10 +1102,10 @@ class ZipFile:
         if not isinstance(comment, bytes):
             raise TypeError("comment: expected bytes, got %s" % type(comment))
         # check for valid comment length
-        if len(comment) >= ZIP_MAX_COMMENT:
-            if self.debug:
-                print('Archive comment is too long; truncating to %d bytes'
-                        % ZIP_MAX_COMMENT)
+        if len(comment) > ZIP_MAX_COMMENT:
+            import warnings
+            warnings.warn('Archive comment is too long; truncating to %d bytes'
+                          % ZIP_MAX_COMMENT, stacklevel=2)
             comment = comment[:ZIP_MAX_COMMENT]
         self._comment = comment
         self._didModify = True
@@ -985,6 +1154,14 @@ class ZipFile:
             if fheader[_FH_EXTRA_FIELD_LENGTH]:
                 zef_file.read(fheader[_FH_EXTRA_FIELD_LENGTH])
 
+            if zinfo.flag_bits & 0x20:
+                # Zip 2.7: compressed patched data
+                raise NotImplementedError("compressed patched data (flag bit 5)")
+
+            if zinfo.flag_bits & 0x40:
+                # strong encryption
+                raise NotImplementedError("strong encryption (flag bit 6)")
+
             if zinfo.flag_bits & 0x800:
                 # UTF-8 filename
                 fname_str = fname.decode("utf-8")
@@ -1056,6 +1233,21 @@ class ZipFile:
         for zipinfo in members:
             self.extract(zipinfo, path, pwd)
 
+    @classmethod
+    def _sanitize_windows_name(cls, arcname, pathsep):
+        """Replace bad characters and remove trailing dots from parts."""
+        table = cls._windows_illegal_name_trans_table
+        if not table:
+            illegal = ':<>|"?*'
+            table = str.maketrans(illegal, '_' * len(illegal))
+            cls._windows_illegal_name_trans_table = table
+        arcname = arcname.translate(table)
+        # remove trailing dots
+        arcname = (x.rstrip('.') for x in arcname.split(pathsep))
+        # rejoin, removing empty parts.
+        arcname = pathsep.join(x for x in arcname if x)
+        return arcname
+
     def _extract_member(self, member, targetpath, pwd):
         """Extract the ZipInfo object 'member' to a physical
            file on the path targetpath.
@@ -1069,16 +1261,12 @@ class ZipFile:
         # interpret absolute pathname as relative, remove drive letter or
         # UNC path, redundant separators, "." and ".." components.
         arcname = os.path.splitdrive(arcname)[1]
+        invalid_path_parts = ('', os.path.curdir, os.path.pardir)
         arcname = os.path.sep.join(x for x in arcname.split(os.path.sep)
-                    if x not in ('', os.path.curdir, os.path.pardir))
+                                   if x not in invalid_path_parts)
         if os.path.sep == '\\':
             # filter illegal characters on Windows
-            illegal = ':<>|"?*'
-            table = str.maketrans(illegal, '_' * len(illegal))
-            arcname = arcname.translate(table)
-            # remove trailing dots
-            arcname = (x.rstrip('.') for x in arcname.split(os.path.sep))
-            arcname = os.path.sep.join(x for x in arcname if x)
+            arcname = self._sanitize_windows_name(arcname, os.path.sep)
 
         targetpath = os.path.join(targetpath, arcname)
         targetpath = os.path.normpath(targetpath)
@@ -1102,18 +1290,14 @@ class ZipFile:
     def _writecheck(self, zinfo):
         """Check for errors before writing a file to the archive."""
         if zinfo.filename in self.NameToInfo:
-            if self.debug:      # Warning for duplicate names
-                print("Duplicate name:", zinfo.filename)
+            import warnings
+            warnings.warn('Duplicate name: %r' % zinfo.filename, stacklevel=3)
         if self.mode not in ("w", "a"):
             raise RuntimeError('write() requires mode "w" or "a"')
         if not self.fp:
             raise RuntimeError(
                   "Attempt to write ZIP archive that was already closed")
-        if zinfo.compress_type == ZIP_DEFLATED and not zlib:
-            raise RuntimeError(
-                  "Compression requires the (missing) zlib module")
-        if zinfo.compress_type not in (ZIP_STORED, ZIP_DEFLATED):
-            raise RuntimeError("That compression method is not supported")
+        _check_compression(zinfo.compress_type)
         if zinfo.file_size > ZIP64_LIMIT:
             if not self._allowZip64:
                 raise LargeZipFile("Filesize would require ZIP64 extensions")
@@ -1151,6 +1335,9 @@ class ZipFile:
         zinfo.file_size = st.st_size
         zinfo.flag_bits = 0x00
         zinfo.header_offset = self.fp.tell()    # Start of header bytes
+        if zinfo.compress_type == ZIP_LZMA:
+            # Compressed data includes an end-of-stream (EOS) marker
+            zinfo.flag_bits |= 0x02
 
         self._writecheck(zinfo)
         self._didModify = True
@@ -1164,6 +1351,7 @@ class ZipFile:
             self.fp.write(zinfo.FileHeader(False))
             return
 
+        cmpr = _get_compressor(zinfo.compress_type)
         with open(filename, "rb") as fp:
             # Must overwrite CRC and sizes with correct data later
             zinfo.CRC = CRC = 0
@@ -1172,11 +1360,6 @@ class ZipFile:
             zip64 = self._allowZip64 and \
                     zinfo.file_size * 1.05 > ZIP64_LIMIT
             self.fp.write(zinfo.FileHeader(zip64))
-            if zinfo.compress_type == ZIP_DEFLATED:
-                cmpr = zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION,
-                     zlib.DEFLATED, -15)
-            else:
-                cmpr = None
             file_size = 0
             while 1:
                 buf = fp.read(1024 * 8)
@@ -1235,13 +1418,15 @@ class ZipFile:
         zinfo.header_offset = self.fp.tell()    # Start of header data
         if compress_type is not None:
             zinfo.compress_type = compress_type
+        if zinfo.compress_type == ZIP_LZMA:
+            # Compressed data includes an end-of-stream (EOS) marker
+            zinfo.flag_bits |= 0x02
 
         self._writecheck(zinfo)
         self._didModify = True
         zinfo.CRC = crc32(data) & 0xffffffff       # CRC-32 checksum
-        if zinfo.compress_type == ZIP_DEFLATED:
-            co = zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION,
-                 zlib.DEFLATED, -15)
+        co = _get_compressor(zinfo.compress_type)
+        if co:
             data = co.compress(data) + co.flush()
             zinfo.compress_size = len(data)    # Compressed size
         else:
@@ -1298,18 +1483,22 @@ class ZipFile:
                         header_offset = zinfo.header_offset
 
                     extra_data = zinfo.extra
+                    min_version = 0
                     if extra:
                         # Append a ZIP64 field to the extra's
                         extra_data = struct.pack(
                                 '<HH' + 'Q'*len(extra),
                                 1, 8*len(extra), *extra) + extra_data
 
-                        extract_version = max(45, zinfo.extract_version)
-                        create_version = max(45, zinfo.create_version)
-                    else:
-                        extract_version = zinfo.extract_version
-                        create_version = zinfo.create_version
+                        min_version = ZIP64_VERSION
+
+                    if zinfo.compress_type == ZIP_BZIP2:
+                        min_version = max(BZIP2_VERSION, min_version)
+                    elif zinfo.compress_type == ZIP_LZMA:
+                        min_version = max(LZMA_VERSION, min_version)
 
+                    extract_version = max(min_version, zinfo.extract_version)
+                    create_version = max(min_version, zinfo.create_version)
                     try:
                         filename, flag_bits = zinfo._encodeFilenameFlags()
                         centdir = struct.pack(structCentralDir,