MAINT: Introduced _read_bytes function

This wrapper function is now used everywhere in format.py to correctly handle the case when fp.read returns fewer bytes than requested. Also added a test for the original bug: loading an array of more than 64K from a zip file.
author: Max Sperlich <max.sperlich@livingsocial.com> 2013-12-11 21:35:50 -0500
committer: Max Sperlich <max.sperlich@livingsocial.com> 2013-12-11 21:35:50 -0500
commit: e09c5f040fa020bb47c6610356214c0477c206aa (patch)
tree: 37b2370e0cf5520c39ed231ccb5c93a5dbaade0b /numpy/lib
parent: bdb6f8cabf755d4d7b18a3d7e7475480ce6fc008 (diff)
download: numpy-e09c5f040fa020bb47c6610356214c0477c206aa.tar.gz
2 files changed, 45 insertions, 25 deletions
diff --git a/numpy/lib/format.py b/numpy/lib/format.py
index 81366c50d..4cfbbe05d 100644
--- a/numpy/lib/format.py
+++ b/numpy/lib/format.py
@@ -138,6 +138,7 @@ from __future__ import division, absolute_import, print_function
 
 import numpy
 import sys
+import io
 from numpy.lib.utils import safe_eval
 from numpy.compat import asbytes, isfileobj, long, basestring
 
@@ -187,10 +188,7 @@ def read_magic(fp):
     major : int
     minor : int
     """
-    magic_str = fp.read(MAGIC_LEN)
-    if len(magic_str) != MAGIC_LEN:
-        msg = "could not read %d characters for the magic string; got %r"
-        raise ValueError(msg % (MAGIC_LEN, magic_str))
+    magic_str = _read_bytes(fp, MAGIC_LEN, "magic string")
     if magic_str[:-2] != MAGIC_PREFIX:
         msg = "the magic string is not correct; expected %r, got %r"
         raise ValueError(msg % (MAGIC_PREFIX, magic_str[:-2]))
@@ -322,14 +320,9 @@ def read_array_header_1_0(fp):
     # Read an unsigned, little-endian short int which has the length of the
     # header.
     import struct
-    hlength_str = fp.read(2)
-    if len(hlength_str) != 2:
-        msg = "EOF at %s before reading array header length"
-        raise ValueError(msg % fp.tell())
+    hlength_str = _read_bytes(fp, 2, "array header length")
     header_length = struct.unpack('<H', hlength_str)[0]
-    header = fp.read(header_length)
-    if len(header) != header_length:
-        raise ValueError("EOF at %s before reading array header" % fp.tell())
+    header = _read_bytes(fp, header_length, "array header")
 
     # The header is a pretty-printed string representation of a literal Python
     # dictionary with trailing newlines padded to a 16-byte boundary. The keys
@@ -476,21 +469,12 @@ def read_array(fp):
             max_read_count = BUFFER_SIZE // min(BUFFER_SIZE, dtype.itemsize)
 
             array = numpy.empty(count, dtype=dtype)
-            extra_data = bytes()
-            i = 0
-            while i < count: 
+            for i in range(0, count, max_read_count):
                 read_count = min(max_read_count, count - i)
-                data = extra_data + fp.read(int(read_count * dtype.itemsize))
-                if len(data) == len(extra_data):
-                    #Unable to read sufficient data from fp
-                    msg = "EOF: expected %d entries, got %d entries" % (count, i)
-                    raise ValueError(msg)
-                actual_count = len(data) // dtype.itemsize
-                if actual_count > 0:
-                    array[i:i + actual_count] = \
-                        numpy.frombuffer(data, dtype=dtype, count=actual_count)
-                    i += actual_count
-                extra_data = data[actual_count * dtype.itemsize:]
+                read_size = int(read_count * dtype.itemsize)
+                data = _read_bytes(fp, read_size, "array data")
+                array[i:i+read_count] = numpy.frombuffer(data, dtype=dtype,
+                                                         count=read_count)
 
         if fortran_order:
             array.shape = shape[::-1]
@@ -609,3 +593,31 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None,
         mode=mode, offset=offset)
 
     return marray
+
+
+def _read_bytes(fp, size, error_template="ran out of data"):
+    """
+    Read from file-like object until size bytes are read.
+    Raises ValueError if EOF is encountered before size bytes are read.
+    Non-blocking objects only supported if they derive from io objects.
+
+    Required as e.g. ZipExtFile in python 2.6 can return less data than
+    requested.
+    """
+    data = bytes()
+    while True:
+        # io files (default in python3) return None or raise on would-block,
+        # python2 file will truncate, probably nothing can be done about that.
+        # note that regular files can't be non-blocking
+        try:
+            r = fp.read(size - len(data))
+            data += r
+            if len(r) == 0 or len(data) == size:
+                break
+        except io.BlockingIOError:
+            pass
+    if len(data) != size:
+        msg = "EOF: reading %s, expected %d bytes got %d"
+        raise ValueError(msg %(error_template, size, len(data)))
+    else:
+        return data
diff --git a/numpy/lib/tests/test_format.py b/numpy/lib/tests/test_format.py
index abb93fbd5..dbcdaaaa6 100644
--- a/numpy/lib/tests/test_format.py
+++ b/numpy/lib/tests/test_format.py
@@ -517,6 +517,14 @@ def test_memmap_roundtrip():
             del ma
 
 
+def test_compressed_roundtrip():
+    arr = np.random.rand(200, 200)
+    npz_file = os.path.join(tempdir, 'compressed.npz')
+    np.savez_compressed(npz_file, arr=arr)
+    arr1 = np.load(npz_file)['arr']
+    assert_array_equal(arr, arr1)
+
+
 def test_write_version_1_0():
     f = BytesIO()
     arr = np.arange(1)
author	Max Sperlich <max.sperlich@livingsocial.com>	2013-12-11 21:35:50 -0500
committer	Max Sperlich <max.sperlich@livingsocial.com>	2013-12-11 21:35:50 -0500
commit	e09c5f040fa020bb47c6610356214c0477c206aa (patch)
tree	37b2370e0cf5520c39ed231ccb5c93a5dbaade0b /numpy/lib
parent	bdb6f8cabf755d4d7b18a3d7e7475480ce6fc008 (diff)
download	numpy-e09c5f040fa020bb47c6610356214c0477c206aa.tar.gz