Diffstat (limited to 'numpy/lib/npyio.py')
-rw-r--r--  numpy/lib/npyio.py  340
1 file changed, 148 insertions(+), 192 deletions(-)
diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py
index d12482cb7..b8b3fe877 100644
--- a/numpy/lib/npyio.py
+++ b/numpy/lib/npyio.py
@@ -5,7 +5,7 @@ import itertools
import warnings
import weakref
import contextlib
-from operator import itemgetter, index as opindex
+from operator import itemgetter, index as opindex, methodcaller
from collections.abc import Mapping
import numpy as np
@@ -14,7 +14,6 @@ from ._datasource import DataSource
from numpy.core import overrides
from numpy.core.multiarray import packbits, unpackbits
from numpy.core.overrides import set_array_function_like_doc, set_module
-from numpy.core._internal import recursive
from ._iotools import (
LineSplitter, NameValidator, StringConverter, ConverterError,
ConverterLockError, ConversionWarning, _is_string_like,
@@ -27,18 +26,9 @@ from numpy.compat import (
)
-@set_module('numpy')
-def loads(*args, **kwargs):
- # NumPy 1.15.0, 2017-12-10
- warnings.warn(
- "np.loads is deprecated, use pickle.loads instead",
- DeprecationWarning, stacklevel=2)
- return pickle.loads(*args, **kwargs)
-
-
__all__ = [
- 'savetxt', 'loadtxt', 'genfromtxt', 'ndfromtxt', 'mafromtxt',
- 'recfromtxt', 'recfromcsv', 'load', 'loads', 'save', 'savez',
+ 'savetxt', 'loadtxt', 'genfromtxt',
+ 'recfromtxt', 'recfromcsv', 'load', 'save', 'savez',
'savez_compressed', 'packbits', 'unpackbits', 'fromregex', 'DataSource'
]
@@ -729,36 +719,100 @@ def _savez(file, args, kwds, compress, allow_pickle=True, pickle_kwargs=None):
zipf.close()
+def _floatconv(x):
+ try:
+ return float(x) # The fastest path.
+ except ValueError:
+ if '0x' in x: # Don't accidentally convert "a" ("0xa") to 10.
+ try:
+ return float.fromhex(x)
+ except ValueError:
+ pass
+ raise # Raise the original exception, which makes more sense.
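A minimal sketch of the intended behavior (assuming this patch is applied; _floatconv is a private helper, so importing it is for illustration only):

    from numpy.lib.npyio import _floatconv  # private helper added above

    assert _floatconv("1.5") == 1.5        # fast path: plain float()
    assert _floatconv("0x1.8p0") == 1.5    # hex-float fallback
    try:
        _floatconv("a")                    # must NOT be read as 0xa == 10
    except ValueError:
        pass                               # the original error is re-raised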
+
+
+_CONVERTERS = [ # These converters only ever get strs (not bytes) as input.
+ (np.bool_, lambda x: bool(int(x))),
+ (np.uint64, np.uint64),
+ (np.int64, np.int64),
+ (np.integer, lambda x: int(float(x))),
+ (np.longdouble, np.longdouble),
+ (np.floating, _floatconv),
+ (complex, lambda x: complex(x.replace('+-', '-'))),
+ (np.bytes_, methodcaller('encode', 'latin-1')),
+ (np.unicode_, str),
+]
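Note on methodcaller: it yields a single module-level callable, unlike an inline lambda, so lists of converters built from it can collapse in a set. A sketch:

    from operator import methodcaller

    enc = methodcaller('encode', 'latin-1')  # same role as the bytes entry above
    assert enc("abc") == b"abc"
    # one shared instance hashes consistently, so a homogeneous column list
    # collapses to a single element in a set:
    assert len({enc, enc, enc}) == 1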
+
+
def _getconv(dtype):
- """ Find the correct dtype converter. Adapted from matplotlib """
-
- def floatconv(x):
- x.lower()
- if '0x' in x:
- return float.fromhex(x)
- return float(x)
-
- typ = dtype.type
- if issubclass(typ, np.bool_):
- return lambda x: bool(int(x))
- if issubclass(typ, np.uint64):
- return np.uint64
- if issubclass(typ, np.int64):
- return np.int64
- if issubclass(typ, np.integer):
- return lambda x: int(float(x))
- elif issubclass(typ, np.longdouble):
- return np.longdouble
- elif issubclass(typ, np.floating):
- return floatconv
- elif issubclass(typ, complex):
- return lambda x: complex(asstr(x).replace('+-', '-'))
- elif issubclass(typ, np.bytes_):
- return asbytes
- elif issubclass(typ, np.unicode_):
- return asunicode
+ """
+ Find the correct dtype converter. Adapted from matplotlib.
+
+ Even when a lambda is returned, it is defined at the toplevel, to allow
+ testing for equality and enabling optimization for single-type data.
+ """
+ for base, conv in _CONVERTERS:
+ if issubclass(dtype.type, base):
+ return conv
+ return str
+
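Sanity check for the converter lookup (a sketch; _getconv is private and shown for illustration):

    import numpy as np
    from numpy.lib.npyio import _getconv  # private helper

    # Equal dtypes yield the identical callable, which is what the
    # single-type optimization in loadtxt (below) relies on.
    assert _getconv(np.dtype(float)) is _getconv(np.dtype(float))
    assert _getconv(np.dtype(np.int64)) is np.int64
    assert _getconv(np.dtype('U8')) is str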
+
+# _loadtxt_flatten_dtype_internal and _loadtxt_pack_items are loadtxt helpers
+# lifted to the toplevel because recursive inner functions cause either
+# GC-dependent reference loops (because they are closures over loadtxt's
+# internal variables) or large overheads if using a manual trampoline to hide
+# the recursive calls.
+
+
+# not to be confused with the flatten_dtype we import...
+def _loadtxt_flatten_dtype_internal(dt):
+ """Unpack a structured data-type, and produce a packer function."""
+ if dt.names is None:
+ # If the dtype is flattened, return.
+ # If the dtype has a shape, the dtype occurs
+ # in the list more than once.
+ shape = dt.shape
+ if len(shape) == 0:
+ return ([dt.base], None)
+ else:
+ packing = [(shape[-1], list)]
+ if len(shape) > 1:
+ for dim in dt.shape[-2::-1]:
+ packing = [(dim*packing[0][0], packing*dim)]
+ return ([dt.base] * int(np.prod(dt.shape)),
+ functools.partial(_loadtxt_pack_items, packing))
+ else:
+ types = []
+ packing = []
+ for field in dt.names:
+ tp, bytes = dt.fields[field]
+ flat_dt, flat_packer = _loadtxt_flatten_dtype_internal(tp)
+ types.extend(flat_dt)
+ flat_packing = flat_packer.args[0] if flat_packer else None
+ # Avoid extra nesting for subarrays
+ if tp.ndim > 0:
+ packing.extend(flat_packing)
+ else:
+ packing.append((len(flat_dt), flat_packing))
+ return (types, functools.partial(_loadtxt_pack_items, packing))
+
+
+def _loadtxt_pack_items(packing, items):
+ """Pack items into nested lists based on re-packing info."""
+ if packing is None:
+ return items[0]
+ elif packing is tuple:
+ return tuple(items)
+ elif packing is list:
+ return list(items)
else:
- return asstr
+ start = 0
+ ret = []
+ for length, subpacking in packing:
+ ret.append(
+ _loadtxt_pack_items(subpacking, items[start:start+length]))
+ start += length
+ return tuple(ret)
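A sketch of the flatten/pack round trip on a structured dtype (private helper, illustration only):

    import numpy as np
    from numpy.lib.npyio import _loadtxt_flatten_dtype_internal

    dt = np.dtype([('name', 'U10'), ('pos', float, (2,))])
    types, packer = _loadtxt_flatten_dtype_internal(dt)
    # types == [dtype('<U10'), dtype('float64'), dtype('float64')]
    assert packer(['spam', 1.0, 2.0]) == ('spam', [1.0, 2.0])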
# amount of lines loadtxt reads in one chunk, can be overridden for testing
@@ -783,10 +837,11 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
Parameters
----------
- fname : file, str, or pathlib.Path
- File, filename, or generator to read. If the filename extension is
- ``.gz`` or ``.bz2``, the file is first decompressed. Note that
- generators should return byte strings.
+ fname : file, str, pathlib.Path, list of str, generator
+ File, filename, list, or generator to read. If the filename
+ extension is ``.gz`` or ``.bz2``, the file is first decompressed. Note
+ that generators must return bytes or strings. The strings
+ in a list or produced by a generator are treated as lines.
dtype : data-type, optional
Data-type of the resulting array; default: float. If this is a
structured data-type, the resulting array will be 1-dimensional, and
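For example, the documented list-of-strings input works directly:

    import numpy as np

    a = np.loadtxt(["1 2", "3 4"])  # each str is treated as one line
    assert a.shape == (2, 2)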
@@ -915,60 +970,11 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
# Nested functions used by loadtxt.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- # not to be confused with the flatten_dtype we import...
- @recursive
- def flatten_dtype_internal(self, dt):
- """Unpack a structured data-type, and produce re-packing info."""
- if dt.names is None:
- # If the dtype is flattened, return.
- # If the dtype has a shape, the dtype occurs
- # in the list more than once.
- shape = dt.shape
- if len(shape) == 0:
- return ([dt.base], None)
- else:
- packing = [(shape[-1], list)]
- if len(shape) > 1:
- for dim in dt.shape[-2::-1]:
- packing = [(dim*packing[0][0], packing*dim)]
- return ([dt.base] * int(np.prod(dt.shape)), packing)
- else:
- types = []
- packing = []
- for field in dt.names:
- tp, bytes = dt.fields[field]
- flat_dt, flat_packing = self(tp)
- types.extend(flat_dt)
- # Avoid extra nesting for subarrays
- if tp.ndim > 0:
- packing.extend(flat_packing)
- else:
- packing.append((len(flat_dt), flat_packing))
- return (types, packing)
-
- @recursive
- def pack_items(self, items, packing):
- """Pack items into nested lists based on re-packing info."""
- if packing is None:
- return items[0]
- elif packing is tuple:
- return tuple(items)
- elif packing is list:
- return list(items)
- else:
- start = 0
- ret = []
- for length, subpacking in packing:
- ret.append(self(items[start:start+length], subpacking))
- start += length
- return tuple(ret)
-
def split_line(line):
"""Chop off comments, strip, and split at delimiter. """
line = _decode_line(line, encoding=encoding)
-
- if comments is not None:
- line = regex_comments.split(line, maxsplit=1)[0]
+ for comment in comments: # Much faster than using a single regex.
+ line = line.split(comment, 1)[0]
line = line.strip('\r\n')
return line.split(delimiter) if line else []
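The replacement strategy in isolation, as plain Python — one str.split per comment token instead of a single alternation regex (the "//" token here is hypothetical):

    line = "1 2 # trailing comment"
    for comment in ["#", "//"]:          # hypothetical comment tokens
        line = line.split(comment, 1)[0]
    assert line.strip() == "1 2"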
@@ -993,16 +999,13 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
continue
if usecols:
vals = [vals[j] for j in usecols]
- if len(vals) != N:
+ if len(vals) != ncols:
line_num = i + skiprows + 1
raise ValueError("Wrong number of columns at line %d"
% line_num)
-
- # Convert each value according to its column and store
- items = [conv(val) for (conv, val) in zip(converters, vals)]
-
- # Then pack it according to the dtype's nesting
- items = pack_items(items, packing)
+ # Convert each value according to its column, then pack it
+ # according to the dtype's nesting
+ items = packer(convert_row(vals))
X.append(items)
if len(X) > chunk_size:
yield X
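The chunking pattern used here, reduced to a self-contained sketch:

    def read_chunks(rows, chunk_size):
        X = []
        for row in rows:
            X.append(row)
            if len(X) > chunk_size:
                yield X
                X = []
        if X:
            yield X                      # final partial chunk

    assert list(read_chunks([1, 2, 3, 4], chunk_size=2)) == [[1, 2, 3], [4]]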
@@ -1023,9 +1026,8 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
if isinstance(comments, (str, bytes)):
comments = [comments]
comments = [_decode_line(x) for x in comments]
- # Compile regex for comments beforehand
- comments = (re.escape(comment) for comment in comments)
- regex_comments = re.compile('|'.join(comments))
+ else:
+ comments = []
if delimiter is not None:
delimiter = _decode_line(delimiter)
@@ -1060,7 +1062,7 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
dtype = np.dtype(dtype)
defconv = _getconv(dtype)
- dtype_types, packing = flatten_dtype_internal(dtype)
+ dtype_types, packer = _loadtxt_flatten_dtype_internal(dtype)
fown = False
try:
@@ -1076,7 +1078,8 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
fencoding = getattr(fname, 'encoding', 'latin1')
except TypeError as e:
raise ValueError(
- 'fname must be a string, file handle, or generator'
+ f"fname must be a string, filehandle, list of strings,\n"
+ f"or generator. Got {type(fname)} instead."
) from e
# input may be a python2 io stream
@@ -1093,32 +1096,32 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
for i in range(skiprows):
next(fh)
- # Read until we find a line with some values, and use
- # it to estimate the number of columns, N.
- first_vals = None
- try:
- while not first_vals:
- first_line = next(fh)
- first_vals = split_line(first_line)
- except StopIteration:
- # End of lines reached
+ # Read until we find a line with some values, and use it to determine
+ # the need for decoding and estimate the number of columns.
+ for first_line in fh:
+ ncols = len(usecols or split_line(first_line))
+ if ncols:
+ break
+ else: # End of lines reached
first_line = ''
- first_vals = []
+ ncols = len(usecols or [])
warnings.warn('loadtxt: Empty input file: "%s"' % fname,
stacklevel=2)
- N = len(usecols or first_vals)
- # Now that we know N, create the default converters list, and
+ # Now that we know ncols, create the default converters list, and
# set packing, if necessary.
if len(dtype_types) > 1:
# We're dealing with a structured array, each field of
# the dtype matches a column
converters = [_getconv(dt) for dt in dtype_types]
else:
- # All fields have the same dtype
- converters = [defconv for i in range(N)]
- if N > 1:
- packing = [(N, tuple)]
+ # All fields have the same dtype; use specialized packers which are
+ # much faster than those using _loadtxt_pack_items.
+ converters = [defconv for i in range(ncols)]
+ if ncols == 1:
+ packer = itemgetter(0)
+ else:
+ def packer(row): return row
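What the specialized packers do, in isolation:

    from operator import itemgetter

    unwrap = itemgetter(0)               # ncols == 1: row -> scalar
    assert unwrap([3.14]) == 3.14

    def identity(row):                   # ncols > 1: row passes through
        return row
    assert identity([1.0, 2.0]) == [1.0, 2.0]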
# By preference, use the converters specified by the user
for i, conv in (user_converters or {}).items():
@@ -1130,18 +1133,26 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
continue
if byte_converters:
# converters may use decode to workaround numpy's old
- # behaviour, so encode the string again before passing to
- # the user converter
- def tobytes_first(x, conv):
- if type(x) is bytes:
- return conv(x)
+ # behaviour, so encode the string again (converters are only
+ # called with strings) before passing to the user converter.
+ def tobytes_first(conv, x):
return conv(x.encode("latin1"))
- converters[i] = functools.partial(tobytes_first, conv=conv)
+ converters[i] = functools.partial(tobytes_first, conv)
else:
converters[i] = conv
- converters = [conv if conv is not bytes else
- lambda x: x.encode(fencoding) for conv in converters]
+ fencode = methodcaller("encode", fencoding)
+ converters = [conv if conv is not bytes else fencode
+ for conv in converters]
+ if len(set(converters)) == 1:
+ # Optimize single-type data. Note that this is only reached if
+ # `_getconv` returns equal callables (i.e. not local lambdas) on
+ # equal dtypes.
+ def convert_row(vals, _conv=converters[0]):
+ return [*map(_conv, vals)]
+ else:
+ def convert_row(vals):
+ return [conv(val) for conv, val in zip(converters, vals)]
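The effect of the single-converter fast path, sketched standalone:

    converters = [float, float, float]       # homogeneous columns
    if len(set(converters)) == 1:
        conv = converters[0]
        def convert_row(vals):
            return [*map(conv, vals)]        # one map() call per row
    else:
        def convert_row(vals):
            return [c(v) for c, v in zip(converters, vals)]

    assert convert_row(["1", "2.5", "3"]) == [1.0, 2.5, 3.0]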
# read data in chunks and fill it into an array via resize
# over-allocating and shrinking the array later may be faster but is
@@ -1581,8 +1592,8 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
----------
fname : file, str, pathlib.Path, list of str, generator
File, filename, list, or generator to read. If the filename
- extension is `.gz` or `.bz2`, the file is first decompressed. Note
- that generators must return byte strings. The strings
+ extension is ``.gz`` or ``.bz2``, the file is first decompressed. Note
+ that generators must return bytes or strings. The strings
in a list or produced by a generator are treated as lines.
dtype : dtype, optional
Data type of the resulting array.
@@ -1801,8 +1812,9 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
fhd = iter(fid)
except TypeError as e:
raise TypeError(
- "fname must be a string, filehandle, list of strings, "
- "or generator. Got %s instead." % type(fname)) from e
+ f"fname must be a string, filehandle, list of strings,\n"
+ f"or generator. Got {type(fname)} instead."
+ ) from e
with fid_ctx:
split_line = LineSplitter(delimiter=delimiter, comments=comments,
@@ -2292,62 +2304,6 @@ _genfromtxt_with_like = array_function_dispatch(
)(genfromtxt)
-def ndfromtxt(fname, **kwargs):
- """
- Load ASCII data stored in a file and return it as a single array.
-
- .. deprecated:: 1.17
- ndfromtxt` is a deprecated alias of `genfromtxt` which
- overwrites the ``usemask`` argument with `False` even when
- explicitly called as ``ndfromtxt(..., usemask=True)``.
- Use `genfromtxt` instead.
-
- Parameters
- ----------
- fname, kwargs : For a description of input parameters, see `genfromtxt`.
-
- See Also
- --------
- numpy.genfromtxt : generic function.
-
- """
- kwargs['usemask'] = False
- # Numpy 1.17
- warnings.warn(
- "np.ndfromtxt is a deprecated alias of np.genfromtxt, "
- "prefer the latter.",
- DeprecationWarning, stacklevel=2)
- return genfromtxt(fname, **kwargs)
-
-
-def mafromtxt(fname, **kwargs):
- """
- Load ASCII data stored in a text file and return a masked array.
-
- .. deprecated:: 1.17
- np.mafromtxt is a deprecated alias of `genfromtxt` which
- overwrites the ``usemask`` argument with `True` even when
- explicitly called as ``mafromtxt(..., usemask=False)``.
- Use `genfromtxt` instead.
-
- Parameters
- ----------
- fname, kwargs : For a description of input parameters, see `genfromtxt`.
-
- See Also
- --------
- numpy.genfromtxt : generic function to load ASCII data.
-
- """
- kwargs['usemask'] = True
- # Numpy 1.17
- warnings.warn(
- "np.mafromtxt is a deprecated alias of np.genfromtxt, "
- "prefer the latter.",
- DeprecationWarning, stacklevel=2)
- return genfromtxt(fname, **kwargs)
-
-
def recfromtxt(fname, **kwargs):
"""
Load ASCII data from a file and return it in a record array.