From d8edc62e8c9e69280fb8a171c7678b2fea929696 Mon Sep 17 00:00:00 2001 From: Julian Taylor Date: Mon, 3 Apr 2017 14:20:36 +0200 Subject: ENH: Add encoding option to numpy text IO. This modifies loadtxt and genfromtxt in several ways intended to add unicode support for text files by adding an `encoding` keyword to np.loadtxt, np.genfromtxt, np.savetxt, and np.fromregex. The original treatment of the relevant files was to open them as byte files, whereas they are now opened as text files with an encoding. When read, they are decoded to unicode strings for Python3 compatibility, and when written, they are encoded as specified. For backward compatibility, the default encoding in both cases is latin1. --- numpy/lib/_datasource.py | 88 +++++++-- numpy/lib/_iotools.py | 62 +++---- numpy/lib/npyio.py | 346 ++++++++++++++++++++++++---------- numpy/lib/tests/test__iotools.py | 123 ++++++------ numpy/lib/tests/test_io.py | 392 +++++++++++++++++++++++++++++++++++---- 5 files changed, 771 insertions(+), 240 deletions(-) (limited to 'numpy/lib') diff --git a/numpy/lib/_datasource.py b/numpy/lib/_datasource.py index 3affc5195..ad939df3f 100644 --- a/numpy/lib/_datasource.py +++ b/numpy/lib/_datasource.py @@ -15,7 +15,7 @@ DataSource files can originate locally or remotely: - URLs (http, ftp, ...) : 'http://www.scipy.org/not/real/data.txt' DataSource files can also be compressed or uncompressed. Currently only -gzip and bz2 are supported. +gzip, bz2 and xz are supported. Example:: @@ -38,13 +38,59 @@ from __future__ import division, absolute_import, print_function import os import sys import shutil +import io _open = open +def _check_mode(mode, encoding, newline): + if "t" in mode: + if "b" in mode: + raise ValueError("Invalid mode: %r" % (mode,)) + else: + if encoding is not None: + raise ValueError("Argument 'encoding' not supported in binary mode") + if newline is not None: + raise ValueError("Argument 'newline' not supported in binary mode") + +def _python2_bz2open(fn, mode, encoding, newline): + """ wrapper to open bz2 in text mode """ + import bz2 + + _check_mode(mode, encoding, newline) + + if "t" in mode: + # BZ2File is missing necessary functions for TextIOWrapper + raise ValueError("bz2 text files not supported in python2") + else: + return bz2.BZ2File(fn, mode) + +def _python2_gzipopen(fn, mode, encoding, newline): + """ wrapper to open gzip in text mode """ + import gzip + # gzip is lacking read1 needed for TextIOWrapper + class GzipWrap(gzip.GzipFile): + def read1(self, n): + return self.read(n) + + _check_mode(mode, encoding, newline) + + gz_mode = mode.replace("t", "") + if isinstance(fn, (str, bytes)): + binary_file = GzipWrap(fn, gz_mode) + elif hasattr(fn, "read") or hasattr(fn, "write"): + binary_file = GzipWrap(None, gz_mode, fileobj=fn) + else: + raise TypeError("filename must be a str or bytes object, or a file") + + if "t" in mode: + return io.TextIOWrapper(binary_file, encoding, newline=newline) + else: + return binary_file + # Using a class instead of a module-level dictionary # to reduce the initial 'import numpy' overhead by -# deferring the import of bz2 and gzip until needed +# deferring the import of lzma, bz2 and gzip until needed # TODO: .zip support, .tar support? class _FileOpeners(object): @@ -55,7 +101,7 @@ class _FileOpeners(object): supported file format. Attribute lookup is implemented in such a way that an instance of `_FileOpeners` itself can be indexed with the keys of that dictionary. 
Currently uncompressed files as well as files - compressed with ``gzip`` or ``bz2`` compression are supported. + compressed with ``gzip``, ``bz2`` or ``xz`` compression are supported. Notes ----- @@ -65,7 +111,7 @@ class _FileOpeners(object): Examples -------- >>> np.lib._datasource._file_openers.keys() - [None, '.bz2', '.gz'] + [None, '.bz2', '.gz', '.xz', '.lzma'] >>> np.lib._datasource._file_openers['.gz'] is gzip.open True @@ -73,19 +119,31 @@ class _FileOpeners(object): def __init__(self): self._loaded = False - self._file_openers = {None: open} + self._file_openers = {None: io.open} def _load(self): if self._loaded: return try: import bz2 - self._file_openers[".bz2"] = bz2.BZ2File + if sys.version_info[0] >= 3: + self._file_openers[".bz2"] = bz2.open + else: + self._file_openers[".bz2"] = _python2_bz2open except ImportError: pass try: import gzip - self._file_openers[".gz"] = gzip.open + if sys.version_info[0] >= 3: + self._file_openers[".gz"] = gzip.open + else: + self._file_openers[".gz"] = _python2_gzipopen + except ImportError: + pass + try: + import lzma + self._file_openers[".xz"] = lzma.open + self._file_openers[".lzma"] = lzma.open except ImportError: pass self._loaded = True @@ -102,7 +160,7 @@ class _FileOpeners(object): ------- keys : list The keys are None for uncompressed files and the file extension - strings (i.e. ``'.gz'``, ``'.bz2'``) for supported compression + strings (i.e. ``'.gz'``, ``'.xz'``) for supported compression methods. """ @@ -115,7 +173,7 @@ class _FileOpeners(object): _file_openers = _FileOpeners() -def open(path, mode='r', destpath=os.curdir): +def open(path, mode='r', destpath=os.curdir, encoding=None, newline=None): """ Open `path` with `mode` and return the file object. @@ -148,7 +206,7 @@ def open(path, mode='r', destpath=os.curdir): """ ds = DataSource(destpath) - return ds.open(path, mode) + return ds.open(path, mode, encoding=encoding, newline=newline) class DataSource (object): @@ -458,7 +516,7 @@ class DataSource (object): return False return False - def open(self, path, mode='r'): + def open(self, path, mode='r', encoding=None, newline=None): """ Open and return file-like object. @@ -496,7 +554,8 @@ class DataSource (object): _fname, ext = self._splitzipext(found) if ext == 'bz2': mode.replace("+", "") - return _file_openers[ext](found, mode=mode) + return _file_openers[ext](found, mode=mode, + encoding=encoding, newline=newline) else: raise IOError("%s not found." % path) @@ -619,7 +678,7 @@ class Repository (DataSource): """ return DataSource.exists(self, self._fullpath(path)) - def open(self, path, mode='r'): + def open(self, path, mode='r', encoding=None, newline=None): """ Open and return file-like object prepending Repository base URL. @@ -643,7 +702,8 @@ class Repository (DataSource): File object. 
""" - return DataSource.open(self, self._fullpath(path), mode) + return DataSource.open(self, self._fullpath(path), mode, + encoding=encoding, newline=newline) def listdir(self): """ diff --git a/numpy/lib/_iotools.py b/numpy/lib/_iotools.py index 1874c2e97..8e091d42d 100644 --- a/numpy/lib/_iotools.py +++ b/numpy/lib/_iotools.py @@ -8,7 +8,7 @@ __docformat__ = "restructuredtext en" import sys import numpy as np import numpy.core.numeric as nx -from numpy.compat import asbytes, bytes, asbytes_nested, basestring +from numpy.compat import asbytes, asunicode, bytes, asbytes_nested, basestring if sys.version_info[0] >= 3: from builtins import bool, int, float, complex, object, str @@ -17,15 +17,15 @@ else: from __builtin__ import bool, int, float, complex, object, unicode, str -if sys.version_info[0] >= 3: - def _bytes_to_complex(s): - return complex(s.decode('ascii')) +def _decode_line(line, encoding=None): + """ decode bytes from binary input streams, default to latin1 """ + if type(line) is bytes: + if encoding is None: + line = line.decode('latin1') + else: + line = line.decode(encoding) - def _bytes_to_name(s): - return s.decode('ascii') -else: - _bytes_to_complex = complex - _bytes_to_name = str + return line def _is_string_like(obj): @@ -189,12 +189,10 @@ class LineSplitter(object): return lambda input: [_.strip() for _ in method(input)] # - def __init__(self, delimiter=None, comments=b'#', autostrip=True): + def __init__(self, delimiter=None, comments='#', autostrip=True, encoding=None): self.comments = comments # Delimiter is a character - if isinstance(delimiter, unicode): - delimiter = delimiter.encode('ascii') - if (delimiter is None) or _is_bytes_like(delimiter): + if (delimiter is None) or isinstance(delimiter, basestring): delimiter = delimiter or None _handyman = self._delimited_splitter # Delimiter is a list of field widths @@ -213,12 +211,14 @@ class LineSplitter(object): self._handyman = self.autostrip(_handyman) else: self._handyman = _handyman + self.encoding = encoding # def _delimited_splitter(self, line): + """Chop off comments, strip, and split at delimiter. 
""" if self.comments is not None: line = line.split(self.comments)[0] - line = line.strip(b" \r\n") + line = line.strip(" \r\n") if not line: return [] return line.split(self.delimiter) @@ -227,7 +227,7 @@ class LineSplitter(object): def _fixedwidth_splitter(self, line): if self.comments is not None: line = line.split(self.comments)[0] - line = line.strip(b"\r\n") + line = line.strip("\r\n") if not line: return [] fixed = self.delimiter @@ -245,7 +245,7 @@ class LineSplitter(object): # def __call__(self, line): - return self._handyman(line) + return self._handyman(_decode_line(line, self.encoding)) class NameValidator(object): @@ -434,9 +434,9 @@ def str2bool(value): """ value = value.upper() - if value == b'TRUE': + if value == 'TRUE': return True - elif value == b'FALSE': + elif value == 'FALSE': return False else: raise ValueError("Invalid boolean") @@ -527,9 +527,10 @@ class StringConverter(object): _mapper.append((nx.int64, int, -1)) _mapper.extend([(nx.floating, float, nx.nan), - (nx.complexfloating, _bytes_to_complex, nx.nan + 0j), + (nx.complexfloating, complex, nx.nan + 0j), (nx.longdouble, nx.longdouble, nx.nan), - (nx.string_, bytes, b'???')]) + (nx.unicode_, asunicode, '???'), + (nx.string_, asbytes, '???')]) (_defaulttype, _defaultfunc, _defaultfill) = zip(*_mapper) @@ -601,11 +602,6 @@ class StringConverter(object): def __init__(self, dtype_or_func=None, default=None, missing_values=None, locked=False): - # Convert unicode (for Py3) - if isinstance(missing_values, unicode): - missing_values = asbytes(missing_values) - elif isinstance(missing_values, (list, tuple)): - missing_values = asbytes_nested(missing_values) # Defines a lock for upgrade self._locked = bool(locked) # No input dtype: minimal initialization @@ -631,7 +627,7 @@ class StringConverter(object): # None if default is None: try: - default = self.func(b'0') + default = self.func('0') except ValueError: default = None dtype = self._getdtype(default) @@ -676,11 +672,11 @@ class StringConverter(object): self.func = lambda x: int(float(x)) # Store the list of strings corresponding to missing values. if missing_values is None: - self.missing_values = set([b'']) + self.missing_values = set(['']) else: - if isinstance(missing_values, bytes): - missing_values = missing_values.split(b",") - self.missing_values = set(list(missing_values) + [b'']) + if isinstance(missing_values, basestring): + missing_values = missing_values.split(",") + self.missing_values = set(list(missing_values) + ['']) # self._callingfunction = self._strict_call self.type = self._dtypeortype(dtype) @@ -801,7 +797,7 @@ class StringConverter(object): self.iterupgrade(value) def update(self, func, default=None, testing_value=None, - missing_values=b'', locked=False): + missing_values='', locked=False): """ Set StringConverter attributes directly. 
@@ -838,13 +834,13 @@ class StringConverter(object): self.type = self._dtypeortype(self._getdtype(default)) else: try: - tester = func(testing_value or b'1') + tester = func(testing_value or '1') except (TypeError, ValueError): tester = None self.type = self._dtypeortype(self._getdtype(tester)) # Add the missing values to the existing set if missing_values is not None: - if _is_bytes_like(missing_values): + if isinstance(missing_values, basestring): self.missing_values.add(missing_values) elif hasattr(missing_values, '__iter__'): for val in missing_values: diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py index 6de5940d7..fe2aa436b 100644 --- a/numpy/lib/npyio.py +++ b/numpy/lib/npyio.py @@ -1,5 +1,6 @@ from __future__ import division, absolute_import, print_function +import io import sys import os import re @@ -15,11 +16,12 @@ from numpy.core.multiarray import packbits, unpackbits from ._iotools import ( LineSplitter, NameValidator, StringConverter, ConverterError, ConverterLockError, ConversionWarning, _is_string_like, - has_nested_fields, flatten_dtype, easy_dtype, _bytes_to_name + has_nested_fields, flatten_dtype, easy_dtype, _decode_line ) from numpy.compat import ( - asbytes, asstr, asbytes_nested, bytes, basestring, unicode, is_pathlib_path + asbytes, asstr, asunicode, asbytes_nested, bytes, basestring, unicode, + is_pathlib_path ) if sys.version_info[0] >= 3: @@ -731,7 +733,7 @@ def _getconv(dtype): def floatconv(x): x.lower() - if b'0x' in x: + if '0x' in x: return float.fromhex(asstr(x)) return float(x) @@ -752,13 +754,17 @@ def _getconv(dtype): return lambda x: complex(asstr(x)) elif issubclass(typ, np.bytes_): return asbytes + elif issubclass(typ, np.unicode_): + return asunicode else: return asstr +# number of lines loadtxt reads in one chunk, can be overridden for testing +_loadtxt_chunksize = 50000 def loadtxt(fname, dtype=float, comments='#', delimiter=None, converters=None, skiprows=0, usecols=None, unpack=False, - ndmin=0): + ndmin=0, encoding='bytes'): """ Load data from a text file. @@ -813,6 +819,15 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, Legal values: 0 (default), 1 or 2. .. versionadded:: 1.6.0 + encoding: string, optional + Encoding used to decode the inputfile. Does not apply to input streams. + The special value 'bytes' enables backward compatibility workarounds + that ensures you receive byte arrays as results if possible and passes + latin1 encoded strings to converters. Override this value to receive + unicode arrays and pass strings as input to converters. + If set to None the system default is used. + + .. 
versionadded:: 1.14.0 Returns ------- @@ -861,16 +876,20 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, # Type conversions for Py3 convenience if comments is not None: if isinstance(comments, (basestring, bytes)): - comments = [asbytes(comments)] - else: - comments = [asbytes(comment) for comment in comments] + comments = [comments] + + comments = [_decode_line(x) for x in comments] # Compile regex for comments beforehand comments = (re.escape(comment) for comment in comments) - regex_comments = re.compile(b'|'.join(comments)) + regex_comments = re.compile('|'.join(comments)) user_converters = converters - if delimiter is not None: - delimiter = asbytes(delimiter) + + if encoding == 'bytes': + encoding = None + byte_converters = True + else: + byte_converters = False if usecols is not None: # Allow usecols to be a single int or a sequence of ints @@ -896,22 +915,24 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, if is_pathlib_path(fname): fname = str(fname) if _is_string_like(fname): + fh = np.lib._datasource.open(fname, 'rt', encoding=encoding) + fencoding = getattr(fh, 'encoding', 'latin1') + fh = iter(fh) fown = True - if fname.endswith('.gz'): - import gzip - fh = iter(gzip.GzipFile(fname)) - elif fname.endswith('.bz2'): - import bz2 - fh = iter(bz2.BZ2File(fname)) - elif sys.version_info[0] == 2: - fh = iter(open(fname, 'U')) - else: - fh = iter(open(fname)) else: fh = iter(fname) + fencoding = getattr(fname, 'encoding', 'latin1') except TypeError: raise ValueError('fname must be a string, file handle, or generator') - X = [] + + # input may be a python2 io stream + if encoding is not None: + fencoding = encoding + # we must assume local encoding + # TODO: emit portability warning? + elif fencoding is None: + import locale + fencoding = locale.getpreferredencoding() # not to be confused with the flatten_dtype we import... def flatten_dtype_internal(dt): @@ -960,21 +981,43 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, return tuple(ret) def split_line(line): - """Chop off comments, strip, and split at delimiter. - - Note that although the file is opened as text, this function - returns bytes. 
""" + line = _decode_line(line, encoding=encoding) - """ - line = asbytes(line) if comments is not None: - line = regex_comments.split(asbytes(line), maxsplit=1)[0] - line = line.strip(b'\r\n') + line = regex_comments.split(line, maxsplit=1)[0] + line = line.strip('\r\n') if line: return line.split(delimiter) else: return [] + def read_data(chunk_size): + # Parse each line, including the first + X = [] + for i, line in enumerate(itertools.chain([first_line], fh)): + vals = split_line(line) + if len(vals) == 0: + continue + if usecols: + vals = [vals[j] for j in usecols] + if len(vals) != N: + line_num = i + skiprows + 1 + raise ValueError("Wrong number of columns at line %d" + % line_num) + + # Convert each value according to its column and store + items = [conv(val) for (conv, val) in zip(converters, vals)] + + # Then pack it according to the dtype's nesting + items = pack_items(items, packing) + X.append(items) + if len(X) > chunk_size: + yield X + X = [] + if X: + yield X + try: # Make sure we're dealing with a proper dtype dtype = np.dtype(dtype) @@ -1017,30 +1060,42 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, except ValueError: # Unused converter specified continue - converters[i] = conv - - # Parse each line, including the first - for i, line in enumerate(itertools.chain([first_line], fh)): - vals = split_line(line) - if len(vals) == 0: - continue - if usecols: - vals = [vals[j] for j in usecols] - if len(vals) != N: - line_num = i + skiprows + 1 - raise ValueError("Wrong number of columns at line %d" - % line_num) - - # Convert each value according to its column and store - items = [conv(val) for (conv, val) in zip(converters, vals)] - # Then pack it according to the dtype's nesting - items = pack_items(items, packing) - X.append(items) + if byte_converters: + # converters may use decode to workaround numpy's oldd behaviour, + # so encode the string again before passing to the user converter + def tobytes_first(x, conv): + if type(x) is bytes: + return conv(x) + return conv(x.encode("latin1")) + import functools + converters[i] = functools.partial(tobytes_first, conv=conv) + else: + converters[i] = conv + + converters = [conv if conv is not bytes else + lambda x: x.encode(fencoding) for conv in converters] + + # read data in chunks and fill it into an array via resize + # over-allocating and shrinking the array later may be faster but is + # probably not relevant compared to the cost of actually reading and + # converting the data + X = None + for x in read_data(_loadtxt_chunksize): + if X is None: + X = np.array(x, dtype) + else: + nshape = list(X.shape) + pos = nshape[0] + nshape[0] += len(x) + X.resize(nshape) + X[pos:, ...] = x finally: if fown: fh.close() - X = np.array(X, dtype) + if X is None: + X = np.array([], dtype) + # Multicolumn data are returned with shape (1, N, M), i.e. # (1, 1, M) for a single row - remove the singleton dimension there if X.ndim == 3 and X.shape[:2] == (1, 1): @@ -1072,7 +1127,7 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='', - footer='', comments='# '): + footer='', comments='# ', encoding=None): """ Save an array to a text file. @@ -1116,6 +1171,11 @@ def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='', ``numpy.loadtxt``. .. versionadded:: 1.7.0 + encoding: string, optional + Encoding used to encode the outputfile. Does not apply to output + streams. + + .. 
versionadded:: 1.14.0 See Also @@ -1190,21 +1250,51 @@ def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='', fmt = asstr(fmt) delimiter = asstr(delimiter) + class WriteWrap(object): + """ convert to unicode in py2 or to bytes on bytestream inputs """ + def __init__(self, fh, encoding): + self.fh = fh + self.encoding = encoding + self.do_write = self.first_write + + def close(self): + self.fh.close() + + def write(self, v): + self.do_write(v) + + def write_bytes(self, v): + if isinstance(v, bytes): + self.fh.write(v) + else: + self.fh.write(v.encode(self.encoding)) + + def write_normal(self, v): + self.fh.write(asunicode(v)) + + def first_write(self, v): + try: + self.write_normal(v) + self.write = self.write_normal + except TypeError: + # input is probably a bytestream + self.write_bytes(v) + self.write = self.write_bytes + own_fh = False if is_pathlib_path(fname): fname = str(fname) if _is_string_like(fname): + # datasource doesn't support creating a new file ... + open(fname, 'wt').close() + fh = np.lib._datasource.open(fname, 'wt', encoding=encoding) own_fh = True - if fname.endswith('.gz'): - import gzip - fh = gzip.open(fname, 'wb') - else: - if sys.version_info[0] >= 3: - fh = open(fname, 'wb') - else: - fh = open(fname, 'w') + # need to convert str to unicode for text io output + if sys.version_info[0] == 2: + fh = WriteWrap(fh, encoding or 'latin1') elif hasattr(fname, 'write'): - fh = fname + # wrap to handle byte output streams + fh = WriteWrap(fname, encoding or 'latin1') else: raise ValueError('fname must be a string or file handle') @@ -1254,31 +1344,33 @@ def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='', if len(header) > 0: header = header.replace('\n', '\n' + comments) - fh.write(asbytes(comments + header + newline)) + fh.write(comments + header + newline) if iscomplex_X: for row in X: row2 = [] for number in row: row2.append(number.real) row2.append(number.imag) - fh.write(asbytes(format % tuple(row2) + newline)) + fh.write(format % tuple(row2) + newline) else: for row in X: try: - fh.write(asbytes(format % tuple(row) + newline)) + v = format % tuple(row) + newline except TypeError: raise TypeError("Mismatch between array dtype ('%s') and " "format specifier ('%s')" % (str(X.dtype), format)) + fh.write(v) + if len(footer) > 0: footer = footer.replace('\n', '\n' + comments) - fh.write(asbytes(comments + footer + newline)) + fh.write(comments + footer + newline) finally: if own_fh: fh.close() -def fromregex(file, regexp, dtype): +def fromregex(file, regexp, dtype, encoding=None): """ Construct an array from a text file, using regular expression parsing. @@ -1295,6 +1387,10 @@ def fromregex(file, regexp, dtype): Groups in the regular expression correspond to fields in the dtype. dtype : dtype or list of dtypes Dtype for the structured array. + encoding: string, optional + Encoding used to decode the inputfile. Does not apply to input streams. + + .. 
versionadded:: 1.14.0 Returns ------- @@ -1335,16 +1431,22 @@ def fromregex(file, regexp, dtype): """ own_fh = False if not hasattr(file, "read"): - file = open(file, 'rb') + file = np.lib._datasource.open(file, 'rt', encoding=encoding) own_fh = True try: - if not hasattr(regexp, 'match'): - regexp = re.compile(asbytes(regexp)) if not isinstance(dtype, np.dtype): dtype = np.dtype(dtype) - seq = regexp.findall(file.read()) + content = file.read() + if isinstance(content, bytes) and not isinstance(regexp, bytes): + regexp = asbytes(regexp) + elif not isinstance(content, bytes) and isinstance(regexp, bytes): + regexp = asstr(regexp) + + if not hasattr(regexp, 'match'): + regexp = re.compile(regexp) + seq = regexp.findall(content) if seq and not isinstance(seq[0], tuple): # Only one group is in the regexp. # Create the new array as a single data-type and then @@ -1372,7 +1474,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, names=None, excludelist=None, deletechars=None, replace_space='_', autostrip=False, case_sensitive=True, defaultfmt="f%i", unpack=None, usemask=False, loose=True, - invalid_raise=True, max_rows=None): + invalid_raise=True, max_rows=None, encoding='bytes'): """ Load data from a text file, with missing values handled as specified. @@ -1460,6 +1562,15 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, to read the entire file. .. versionadded:: 1.10.0 + encoding: string, optional + Encoding used to decode the inputfile. Does not apply to input streams. + The special value 'bytes' enables backward compatibility workarounds + that ensures you receive byte arrays as results if possible and passes + latin1 encoded strings to converters. Override this value to receive + unicode arrays and pass strings as input to converters. + If set to None the system default is used. + + .. versionadded:: 1.14.0 Returns ------- @@ -1536,15 +1647,6 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, if max_rows < 1: raise ValueError("'max_rows' must be at least 1.") - # Py3 data conversions to bytes, for convenience - if comments is not None: - comments = asbytes(comments) - if isinstance(delimiter, unicode): - delimiter = asbytes(delimiter) - if isinstance(missing_values, (unicode, list, tuple)): - missing_values = asbytes_nested(missing_values) - - # if usemask: from numpy.ma import MaskedArray, make_mask_descr # Check the input dictionary of converters @@ -1554,16 +1656,19 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, "The input argument 'converter' should be a valid dictionary " "(got '%s' instead)" % type(user_converters)) + if encoding == 'bytes': + encoding = None + byte_converters = True + else: + byte_converters = False + # Initialize the filehandle, the LineSplitter and the NameValidator own_fhd = False try: if is_pathlib_path(fname): fname = str(fname) if isinstance(fname, basestring): - if sys.version_info[0] == 2: - fhd = iter(np.lib._datasource.open(fname, 'rbU')) - else: - fhd = iter(np.lib._datasource.open(fname, 'rb')) + fhd = iter(np.lib._datasource.open(fname, 'rt', encoding=encoding)) own_fhd = True else: fhd = iter(fname) @@ -1573,7 +1678,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, "or generator. Got %s instead." 
% type(fname)) split_line = LineSplitter(delimiter=delimiter, comments=comments, - autostrip=autostrip)._handyman + autostrip=autostrip, encoding=encoding) validate_names = NameValidator(excludelist=excludelist, deletechars=deletechars, case_sensitive=case_sensitive, @@ -1587,15 +1692,15 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, first_values = None try: while not first_values: - first_line = next(fhd) + first_line = _decode_line(next(fhd), encoding) if names is True: if comments in first_line: first_line = ( - b''.join(first_line.split(comments)[1:])) + ''.join(first_line.split(comments)[1:])) first_values = split_line(first_line) except StopIteration: # return an empty array if the datafile is empty - first_line = b'' + first_line = '' first_values = [] warnings.warn('genfromtxt: Empty input file: "%s"' % fname, stacklevel=2) @@ -1618,9 +1723,8 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, # Check the names and overwrite the dtype.names if needed if names is True: - names = validate_names([_bytes_to_name(_.strip()) - for _ in first_values]) + names = validate_names([str(_.strip()) for _ in first_values]) - first_line = b'' + first_line = '' elif _is_string_like(names): names = validate_names([_.strip() for _ in names.split(',')]) elif names: @@ -1657,9 +1761,11 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, # Process the missing values ............................... # Rename missing_values for convenience user_missing_values = missing_values or () + if isinstance(user_missing_values, bytes): + user_missing_values = user_missing_values.decode('latin1') # Define the list of missing_values (one column: one list) - missing_values = [list([b'']) for _ in range(nbcols)] + missing_values = [list(['']) for _ in range(nbcols)] # We have a dictionary: process it field by field if isinstance(user_missing_values, dict): @@ -1698,8 +1804,8 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, if value not in entry: entry.append(value) # We have a string : apply it to all entries - elif isinstance(user_missing_values, bytes): - user_value = user_missing_values.split(b",") + elif isinstance(user_missing_values, basestring): + user_value = user_missing_values.split(",") for entry in missing_values: entry.extend(user_value) # We have something else: apply it to all entries @@ -1787,11 +1893,24 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, testing_value = first_values[j] else: testing_value = None - converters[i].update(conv, locked=True, + if conv is bytes: + user_conv = asbytes + elif byte_converters: + # converters may use decode to work around numpy's old behaviour, + # so encode the string again before passing to the user converter + def tobytes_first(x, conv): + if type(x) is bytes: + return conv(x) + return conv(x.encode("latin1")) + import functools + user_conv = functools.partial(tobytes_first, conv=conv) + else: + user_conv = conv + converters[i].update(user_conv, locked=True, testing_value=testing_value, default=filling_values[i], missing_values=missing_values[i],) - uc_update.append((i, conv)) + uc_update.append((i, user_conv)) # Make sure we have the corrected keys in user_converters... user_converters.update(uc_update) @@ -1908,16 +2027,43 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, column_types = [conv.type for conv in converters] # Find the columns with strings... 
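To make the converter wrapping above concrete: under the compatibility default encoding='bytes', user converters keep receiving latin1-encoded bytes exactly as before this patch, while an explicit encoding hands them native strings. A sketch with made-up data:

    import io
    import numpy as np

    raw = io.BytesIO(b'1,ab\n2,cd')
    # default encoding='bytes': the converter sees bytes (via tobytes_first)
    np.genfromtxt(raw, delimiter=',', dtype=None,
                  converters={1: lambda s: s.upper()})

    raw.seek(0)
    # explicit encoding: the converter sees str
    np.genfromtxt(raw, delimiter=',', dtype=None, encoding='latin1',
                  converters={1: lambda s: s.upper()})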
strcolidx = [i for (i, v) in enumerate(column_types) - if v in (type('S'), np.string_)] + if v == np.unicode_] + + typestr = 'U' + if byte_converters and strcolidx: + # convert strings back to bytes for backward compatibility + warnings.warn( + "Reading strings without specifying the encoding argument is " + "deprecated. Set encoding, use None for the system default.", + np.VisibleDeprecationWarning, stacklevel=2) + try: + for j in range(len(data)): + row = list(data[j]) + for i in strcolidx: + row[i] = row[i].encode('latin1') + data[j] = tuple(row) + typestr = 'S' + except UnicodeEncodeError: + # we must use unicode, revert encoding + for k in range(0, j + 1): + row = list(data[k]) + for i in strcolidx: + if isinstance(row[i], bytes): + row[i] = row[i].decode('latin1') + data[k] = tuple(row) + # ... and take the largest number of chars. for i in strcolidx: - column_types[i] = "|S%i" % max(len(row[i]) for row in data) + column_types[i] = "|%s%i" % (typestr, max(len(row[i]) for row in data)) # if names is None: # If the dtype is uniform, don't define names, else use '' base = set([c.type for c in converters if c._checked]) if len(base) == 1: - (ddtype, mdtype) = (list(base)[0], bool) + if strcolidx: + (ddtype, mdtype) = (typestr, bool) + else: + (ddtype, mdtype) = (list(base)[0], bool) else: ddtype = [(defaultfmt % i, dt) for (i, dt) in enumerate(column_types)] @@ -1966,8 +2112,8 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, # Keep the dtype of the current converter if i in user_converters: ishomogeneous &= (ttype == dtype.type) - if ttype == np.string_: - ttype = "|S%i" % max(len(row[i]) for row in data) + if np.issubdtype(ttype, np.character): + ttype = (ttype, max(len(row[i]) for row in data)) descr.append(('', ttype)) else: descr.append(('', dtype)) @@ -1992,7 +2138,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, if usemask and names: for (name, conv) in zip(names or (), converters): missing_values = [conv(_) for _ in conv.missing_values - if _ != b''] + if _ != ''] for mval in missing_values: outputmask[name] |= (output[name] == mval) # Construct the final array diff --git a/numpy/lib/tests/test__iotools.py b/numpy/lib/tests/test__iotools.py index 03192896c..990ee126d 100644 --- a/numpy/lib/tests/test__iotools.py +++ b/numpy/lib/tests/test__iotools.py @@ -19,61 +19,61 @@ class TestLineSplitter(object): def test_no_delimiter(self): "Test LineSplitter w/o delimiter" - strg = b" 1 2 3 4 5 # test" + strg = " 1 2 3 4 5 # test" test = LineSplitter()(strg) - assert_equal(test, [b'1', b'2', b'3', b'4', b'5']) + assert_equal(test, ['1', '2', '3', '4', '5']) test = LineSplitter('')(strg) - assert_equal(test, [b'1', b'2', b'3', b'4', b'5']) + assert_equal(test, ['1', '2', '3', '4', '5']) def test_space_delimiter(self): "Test space delimiter" - strg = b" 1 2 3 4 5 # test" - test = LineSplitter(b' ')(strg) - assert_equal(test, [b'1', b'2', b'3', b'4', b'', b'5']) - test = LineSplitter(b' ')(strg) - assert_equal(test, [b'1 2 3 4', b'5']) + strg = " 1 2 3 4 5 # test" + test = LineSplitter(' ')(strg) + assert_equal(test, ['1', '2', '3', '4', '', '5']) + test = LineSplitter(' ')(strg) + assert_equal(test, ['1 2 3 4', '5']) def test_tab_delimiter(self): "Test tab delimiter" - strg = b" 1\t 2\t 3\t 4\t 5 6" - test = LineSplitter(b'\t')(strg) - assert_equal(test, [b'1', b'2', b'3', b'4', b'5 6']) - strg = b" 1 2\t 3 4\t 5 6" - test = LineSplitter(b'\t')(strg) - assert_equal(test, [b'1 2', b'3 4', b'5 6']) + strg = " 1\t 2\t 3\t 4\t 5 6" + test = 
LineSplitter('\t')(strg) + assert_equal(test, ['1', '2', '3', '4', '5 6']) + strg = " 1 2\t 3 4\t 5 6" + test = LineSplitter('\t')(strg) + assert_equal(test, ['1 2', '3 4', '5 6']) def test_other_delimiter(self): "Test LineSplitter on delimiter" - strg = b"1,2,3,4,,5" - test = LineSplitter(b',')(strg) - assert_equal(test, [b'1', b'2', b'3', b'4', b'', b'5']) + strg = "1,2,3,4,,5" + test = LineSplitter(',')(strg) + assert_equal(test, ['1', '2', '3', '4', '', '5']) # - strg = b" 1,2,3,4,,5 # test" - test = LineSplitter(b',')(strg) - assert_equal(test, [b'1', b'2', b'3', b'4', b'', b'5']) + strg = " 1,2,3,4,,5 # test" + test = LineSplitter(',')(strg) + assert_equal(test, ['1', '2', '3', '4', '', '5']) def test_constant_fixed_width(self): "Test LineSplitter w/ fixed-width fields" - strg = b" 1 2 3 4 5 # test" + strg = " 1 2 3 4 5 # test" test = LineSplitter(3)(strg) - assert_equal(test, [b'1', b'2', b'3', b'4', b'', b'5', b'']) + assert_equal(test, ['1', '2', '3', '4', '', '5', '']) # - strg = b" 1 3 4 5 6# test" + strg = " 1 3 4 5 6# test" test = LineSplitter(20)(strg) - assert_equal(test, [b'1 3 4 5 6']) + assert_equal(test, ['1 3 4 5 6']) # - strg = b" 1 3 4 5 6# test" + strg = " 1 3 4 5 6# test" test = LineSplitter(30)(strg) - assert_equal(test, [b'1 3 4 5 6']) + assert_equal(test, ['1 3 4 5 6']) def test_variable_fixed_width(self): - strg = b" 1 3 4 5 6# test" + strg = " 1 3 4 5 6# test" test = LineSplitter((3, 6, 6, 3))(strg) - assert_equal(test, [b'1', b'3', b'4 5', b'6']) + assert_equal(test, ['1', '3', '4 5', '6']) # - strg = b" 1 3 4 5 6# test" + strg = " 1 3 4 5 6# test" test = LineSplitter((6, 6, 9))(strg) - assert_equal(test, [b'1', b'3 4', b'5 6']) + assert_equal(test, ['1', '3 4', '5 6']) # ----------------------------------------------------------------------------- @@ -133,10 +133,9 @@ class TestNameValidator(object): def _bytes_to_date(s): - if sys.version_info[0] >= 3: - return date(*time.strptime(s.decode('latin1'), "%Y-%m-%d")[:3]) - else: - return date(*time.strptime(s, "%Y-%m-%d")[:3]) + if type(s) == bytes: + s = s.decode("latin1") + return date(*time.strptime(s, "%Y-%m-%d")[:3]) class TestStringConverter(object): @@ -155,7 +154,7 @@ class TestStringConverter(object): assert_equal(converter._status, 0) # test int - assert_equal(converter.upgrade(b'0'), 0) + assert_equal(converter.upgrade('0'), 0) assert_equal(converter._status, 1) # On systems where integer defaults to 32-bit, the statuses will be @@ -164,30 +163,30 @@ class TestStringConverter(object): status_offset = int(nx.dtype(nx.integer).itemsize < nx.dtype(nx.int64).itemsize) # test int > 2**32 - assert_equal(converter.upgrade(b'17179869184'), 17179869184) + assert_equal(converter.upgrade('17179869184'), 17179869184) assert_equal(converter._status, 1 + status_offset) # test float - assert_allclose(converter.upgrade(b'0.'), 0.0) + assert_allclose(converter.upgrade('0.'), 0.0) assert_equal(converter._status, 2 + status_offset) # test complex - assert_equal(converter.upgrade(b'0j'), complex('0j')) + assert_equal(converter.upgrade('0j'), complex('0j')) assert_equal(converter._status, 3 + status_offset) - # test str - assert_equal(converter.upgrade(b'a'), b'a') - assert_equal(converter._status, len(converter._mapper) - 1) + # test str TODO + #assert_equal(converter.upgrade(b'a'), b'a') + #assert_equal(converter._status, len(converter._mapper) - 1) def test_missing(self): "Tests the use of missing values." 
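These test updates track the converter API change: missing-value markers are now plain strings. A short sketch of the behaviour under test:

    >>> from numpy.lib._iotools import StringConverter
    >>> conv = StringConverter(int, default=-1, missing_values='N/A')
    >>> conv('12')
    12
    >>> conv('N/A')  # recognized as missing, falls back to the default
    -1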
- converter = StringConverter(missing_values=(b'missing', - b'missed')) - converter.upgrade(b'0') - assert_equal(converter(b'0'), 0) - assert_equal(converter(b''), converter.default) - assert_equal(converter(b'missing'), converter.default) - assert_equal(converter(b'missed'), converter.default) + converter = StringConverter(missing_values=('missing', + 'missed')) + converter.upgrade('0') + assert_equal(converter('0'), 0) + assert_equal(converter(''), converter.default) + assert_equal(converter('missing'), converter.default) + assert_equal(converter('missed'), converter.default) try: converter('miss') except ValueError: @@ -198,58 +197,58 @@ class TestStringConverter(object): dateparser = _bytes_to_date StringConverter.upgrade_mapper(dateparser, date(2000, 1, 1)) convert = StringConverter(dateparser, date(2000, 1, 1)) - test = convert(b'2001-01-01') + test = convert('2001-01-01') assert_equal(test, date(2001, 1, 1)) - test = convert(b'2009-01-01') + test = convert('2009-01-01') assert_equal(test, date(2009, 1, 1)) - test = convert(b'') + test = convert('') assert_equal(test, date(2000, 1, 1)) def test_string_to_object(self): "Make sure that string-to-object functions are properly recognized" conv = StringConverter(_bytes_to_date) - assert_equal(conv._mapper[-2][0](0), 0j) + assert_equal(conv._mapper[-3][0](0), 0j) assert_(hasattr(conv, 'default')) def test_keep_default(self): "Make sure we don't lose an explicit default" - converter = StringConverter(None, missing_values=b'', + converter = StringConverter(None, missing_values='', default=-999) - converter.upgrade(b'3.14159265') + converter.upgrade('3.14159265') assert_equal(converter.default, -999) assert_equal(converter.type, np.dtype(float)) # converter = StringConverter( - None, missing_values=b'', default=0) - converter.upgrade(b'3.14159265') + None, missing_values='', default=0) + converter.upgrade('3.14159265') assert_equal(converter.default, 0) assert_equal(converter.type, np.dtype(float)) def test_keep_default_zero(self): "Check that we don't lose a default of 0" converter = StringConverter(int, default=0, - missing_values=b"N/A") + missing_values="N/A") assert_equal(converter.default, 0) def test_keep_missing_values(self): "Check that we're not losing missing values" converter = StringConverter(int, default=0, - missing_values=b"N/A") + missing_values="N/A") assert_equal( - converter.missing_values, set([b'', b'N/A'])) + converter.missing_values, set(['', 'N/A'])) def test_int64_dtype(self): "Check that int64 integer types can be specified" converter = StringConverter(np.int64, default=0) - val = b"-9223372036854775807" + val = "-9223372036854775807" assert_(converter(val) == -9223372036854775807) - val = b"9223372036854775807" + val = "9223372036854775807" assert_(converter(val) == 9223372036854775807) def test_uint64_dtype(self): "Check that uint64 integer types can be specified" converter = StringConverter(np.uint64, default=0) - val = b"9223372043271415339" + val = "9223372043271415339" assert_(converter(val) == 9223372043271415339) diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py index 6f7fcc54c..35c37c7be 100644 --- a/numpy/lib/tests/test_io.py +++ b/numpy/lib/tests/test_io.py @@ -8,8 +8,11 @@ from tempfile import NamedTemporaryFile import time import warnings import gc -from io import BytesIO +import io +from io import BytesIO, StringIO from datetime import datetime +import locale +import re import numpy as np import numpy.ma as ma @@ -17,11 +20,19 @@ from numpy.lib._iotools import ConverterError, 
ConversionWarning from numpy.compat import asbytes, bytes, unicode, Path from numpy.ma.testutils import assert_equal from numpy.testing import ( - run_module_suite, assert_warns, assert_, assert_raises_regex, - assert_raises, assert_allclose, assert_array_equal, temppath, dec, IS_PYPY, - suppress_warnings + run_module_suite, assert_warns, assert_, + assert_raises_regex, assert_raises, assert_allclose, + assert_array_equal, temppath, tempdir, dec, IS_PYPY, suppress_warnings ) +def can_encode(v): + """ return True if `v` cannot be encoded with the preferred locale + encoding, i.e. the test using it should be skipped """ + try: + v.encode(locale.getpreferredencoding()) + return False # no skipping + except UnicodeEncodeError: + return True + class TextIO(BytesIO): """Helper IO class. @@ -44,6 +55,16 @@ class TextIO(BytesIO): MAJVER, MINVER = sys.version_info[:2] IS_64BIT = sys.maxsize > 2**32 +try: + import bz2 + HAS_BZ2 = True +except ImportError: + HAS_BZ2 = False +try: + import lzma + HAS_LZMA = True +except ImportError: + HAS_LZMA = False def strptime(s, fmt=None): @@ -52,10 +73,9 @@ 2.5. """ - if sys.version_info[0] >= 3: - return datetime(*time.strptime(s.decode('latin1'), fmt)[:3]) - else: - return datetime(*time.strptime(s, fmt)[:3]) + if type(s) == bytes: + s = s.decode("latin1") + return datetime(*time.strptime(s, fmt)[:3]) class RoundtripTest(object): @@ -466,8 +486,135 @@ class TestSaveTxt(object): b = np.loadtxt(w) assert_array_equal(a, b) + def test_unicode(self): + utf8 = b'\xcf\x96'.decode('UTF-8') + a = np.array([utf8], dtype=np.unicode) + with tempdir() as tmpdir: + # set encoding, as on windows it may not be unicode even on py3 + np.savetxt(os.path.join(tmpdir, 'test.csv'), a, fmt=['%s'], + encoding='UTF-8') + + def test_unicode_roundtrip(self): + utf8 = b'\xcf\x96'.decode('UTF-8') + a = np.array([utf8], dtype=np.unicode) + # our gz wrapper supports encoding + suffixes = ['', '.gz'] + # the Python 2 stdlib versions do not support encoding + if MAJVER > 2: + if HAS_BZ2: + suffixes.append('.bz2') + if HAS_LZMA: + suffixes.extend(['.xz', '.lzma']) + with tempdir() as tmpdir: + for suffix in suffixes: + np.savetxt(os.path.join(tmpdir, 'test.csv' + suffix), a, + fmt=['%s'], encoding='UTF-16-LE') + b = np.loadtxt(os.path.join(tmpdir, 'test.csv' + suffix), + encoding='UTF-16-LE', dtype=np.unicode) + assert_array_equal(a, b) + + def test_unicode_bytestream(self): + utf8 = b'\xcf\x96'.decode('UTF-8') + a = np.array([utf8], dtype=np.unicode) + s = BytesIO() + np.savetxt(s, a, fmt=['%s'], encoding='UTF-8') + s.seek(0) + assert_equal(s.read().decode('UTF-8'), utf8 + '\n') + + def test_unicode_stringstream(self): + utf8 = b'\xcf\x96'.decode('UTF-8') + a = np.array([utf8], dtype=np.unicode) + s = StringIO() + np.savetxt(s, a, fmt=['%s'], encoding='UTF-8') + s.seek(0) + assert_equal(s.read(), utf8 + '\n') + + +class LoadTxtBase(object): + def check_compressed(self, fopen, suffixes): + # Test that we can load data from a compressed file + wanted = np.arange(6).reshape((2, 3)) + linesep = ('\n', '\r\n', '\r') + for sep in linesep: + data = '0 1 2' + sep + '3 4 5' + for suffix in suffixes: + with temppath(suffix=suffix) as name: + with fopen(name, mode='wt', encoding='UTF-32-LE') as f: + f.write(data) + res = getattr(np, self.loadfunc)(name, + encoding='UTF-32-LE') + assert_array_equal(res, wanted) + res = getattr(np, self.loadfunc)( + fopen(name, "rt", encoding='UTF-32-LE')) + assert_array_equal(res, wanted) + + # Python2 .open does not support encoding + @np.testing.dec.skipif(MAJVER == 2) + def test_compressed_gzip(self): + 
self.check_compressed(gzip.open, ('.gz',)) + + @np.testing.dec.skipif(MAJVER == 2 or not HAS_BZ2) + def test_compressed_bz2(self): + self.check_compressed(bz2.open, ('.bz2',)) + + @np.testing.dec.skipif(MAJVER == 2 or not HAS_LZMA) + def test_compressed_lzma(self): + self.check_compressed(lzma.open, ('.xz', '.lzma')) + + def test_encoding(self): + with temppath() as path: + with open(path, "wb") as f: + f.write('0.\n1.\n2.'.encode("UTF-16")) + x = getattr(np, self.loadfunc)(path, encoding="UTF-16") + assert_array_equal(x, [0., 1., 2.]) + + def test_stringload(self): + # umlauts + nonascii = b'\xc3\xb6\xc3\xbc\xc3\xb6'.decode("UTF-8") + with temppath() as path: + with open(path, "wb") as f: + f.write(nonascii.encode("UTF-16")) + x = getattr(np, self.loadfunc)(path, encoding="UTF-16", dtype=np.unicode) + assert_array_equal(x, nonascii) + + def test_binary_decode(self): + utf16 = b'\xff\xfeh\x04 \x00i\x04 \x00j\x04' + v = getattr(np, self.loadfunc)(BytesIO(utf16), dtype=np.unicode, + encoding='UTF-16') + assert_array_equal(v, np.array(utf16.decode('UTF-16').split())) + + def test_converters_decode(self): + # test converters that decode strings + c = TextIO() + c.write(b'\xcf\x96') + c.seek(0) + x = getattr(np, self.loadfunc)(c, dtype=np.unicode, + converters={0: lambda x: x.decode('UTF-8')}) + a = np.array([b'\xcf\x96'.decode('UTF-8')]) + assert_array_equal(x, a) + + def test_converters_nodecode(self): + # test native string converters enabled by setting an encoding + utf8 = b'\xcf\x96'.decode('UTF-8') + with temppath() as path: + with io.open(path, 'wt', encoding='UTF-8') as f: + f.write(utf8) + x = getattr(np, self.loadfunc)(path, dtype=np.unicode, + converters={0: lambda x: x + 't'}, + encoding='UTF-8') + a = np.array([utf8 + 't']) + assert_array_equal(x, a) + + +class TestLoadTxt(LoadTxtBase): + loadfunc = 'loadtxt' + def setUp(self): + # lower chunksize for testing + self.orig_chunk = np.lib.npyio._loadtxt_chunksize + np.lib.npyio._loadtxt_chunksize = 1 + def tearDown(self): + np.lib.npyio._loadtxt_chunksize = self.orig_chunk -class TestLoadTxt(object): def test_record(self): c = TextIO() c.write('1 2\n3 4') @@ -869,9 +1016,24 @@ class TestLoadTxt(object): dt = np.dtype([('x', int), ('a', 'S10'), ('y', int)]) np.loadtxt(c, delimiter=',', dtype=dt, comments=None) # Should succeed + @np.testing.dec.skipif(locale.getpreferredencoding() == 'ANSI_X3.4-1968') + def test_binary_load(self): + butf8 = b"5,6,7,\xc3\x95scarscar\n\r15,2,3,hello\n\r"\ + b"20,2,3,\xc3\x95scar\n\r" + sutf8 = butf8.decode("UTF-8").replace("\r", "").splitlines() + with temppath() as path: + with open(path, "wb") as f: + f.write(butf8) + with open(path, "rb") as f: + x = np.loadtxt(f, encoding="UTF-8", dtype=np.unicode) + assert_array_equal(x, sutf8) + # test broken latin1 conversion people now rely on + with open(path, "rb") as f: + x = np.loadtxt(f, encoding="UTF-8", dtype="S") + expected = [b'5,6,7,\xc3\x95scarscar', b'15,2,3,hello', b'20,2,3,\xc3\x95scar'] + assert_array_equal(x, np.array(expected, dtype="S")) class Testfromregex(object): - # np.fromregex expects files opened in binary mode. 
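The comment removed above is now stale by design: fromregex accepts text input and, when given a filename, decodes it itself. A sketch with a hypothetical file:

    import numpy as np

    with open('temps.txt', 'w') as f:
        f.write('1.312 foo\n1.534 bar')
    dt = [('num', np.float64), ('key', 'U3')]
    arr = np.fromregex('temps.txt', r'([0-9.]+)\s+(\w+)', dt, encoding='UTF-8')
    # arr['key'] -> ['foo', 'bar'] as unicode strings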
def test_record(self): c = TextIO() c.write('1.312 foo\n1.534 bar\n4.444 qux') @@ -904,12 +1066,28 @@ class Testfromregex(object): a = np.array([(1312,), (1534,), (4444,)], dtype=dt) assert_array_equal(x, a) + def test_record_unicode(self): + utf8 = b'\xcf\x96' + with temppath() as path: + with open(path, 'wb') as f: + f.write(b'1.312 foo' + utf8 + b' \n1.534 bar\n4.444 qux') + + dt = [('num', np.float64), ('val', 'U4')] + x = np.fromregex(path, r"(?u)([0-9.]+)\s+(\w+)", dt, encoding='UTF-8') + a = np.array([(1.312, 'foo' + utf8.decode('UTF-8')), (1.534, 'bar'), + (4.444, 'qux')], dtype=dt) + assert_array_equal(x, a) + + regexp = re.compile(r"([0-9.]+)\s+(\w+)", re.UNICODE) + x = np.fromregex(path, regexp, dt, encoding='UTF-8') + assert_array_equal(x, a) + #####-------------------------------------------------------------------------- -class TestFromTxt(object): - # +class TestFromTxt(LoadTxtBase): + loadfunc = 'genfromtxt' def test_record(self): # Test w/ explicit dtype data = TextIO('1 2\n3 4') @@ -1012,7 +1190,10 @@ class TestFromTxt(object): def test_header(self): # Test retrieving a header data = TextIO('gender age weight\nM 64.0 75.0\nF 25.0 60.0') - test = np.ndfromtxt(data, dtype=None, names=True) + with warnings.catch_warnings(record=True) as w: + warnings.filterwarnings('always', '', np.VisibleDeprecationWarning) + test = np.ndfromtxt(data, dtype=None, names=True) + assert_(w[0].category is np.VisibleDeprecationWarning) control = {'gender': np.array([b'M', b'F']), 'age': np.array([64.0, 25.0]), 'weight': np.array([75.0, 60.0])} @@ -1023,7 +1204,10 @@ class TestFromTxt(object): def test_auto_dtype(self): # Test the automatic definition of the output dtype data = TextIO('A 64 75.0 3+4j True\nBCD 25 60.0 5+6j False') - test = np.ndfromtxt(data, dtype=None) + with warnings.catch_warnings(record=True) as w: + warnings.filterwarnings('always', '', np.VisibleDeprecationWarning) + test = np.ndfromtxt(data, dtype=None) + assert_(w[0].category is np.VisibleDeprecationWarning) control = [np.array([b'A', b'BCD']), np.array([64, 25]), np.array([75.0, 60.0]), @@ -1069,7 +1253,10 @@ F 35 58.330000 M 33 21.99 """) # The # is part of the first name and should be deleted automatically. 
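The warnings plumbing added to this and the following tests reflects a new deprecation: with dtype=None and the default encoding='bytes', string columns come back as byte strings and genfromtxt emits a VisibleDeprecationWarning asking for an explicit encoding. Passing one silences it and yields unicode, e.g. (sketch reusing the test's `data` stream):

    test = np.genfromtxt(data, names=True, dtype=None, encoding='latin1')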
- test = np.genfromtxt(data, names=True, dtype=None) + with warnings.catch_warnings(record=True) as w: + warnings.filterwarnings('always', '', np.VisibleDeprecationWarning) + test = np.genfromtxt(data, names=True, dtype=None) + assert_(w[0].category is np.VisibleDeprecationWarning) ctrl = np.array([('M', 21, 72.1), ('F', 35, 58.33), ('M', 33, 21.99)], dtype=[('gender', '|S1'), ('age', int), ('weight', float)]) assert_equal(test, ctrl) @@ -1080,14 +1267,20 @@ M 21 72.100000 F 35 58.330000 M 33 21.99 """) - test = np.genfromtxt(data, names=True, dtype=None) + with warnings.catch_warnings(record=True) as w: + warnings.filterwarnings('always', '', np.VisibleDeprecationWarning) + test = np.genfromtxt(data, names=True, dtype=None) + assert_(w[0].category is np.VisibleDeprecationWarning) assert_equal(test, ctrl) def test_autonames_and_usecols(self): # Tests names and usecols data = TextIO('A B C D\n aaaa 121 45 9.1') - test = np.ndfromtxt(data, usecols=('A', 'C', 'D'), - names=True, dtype=None) + with warnings.catch_warnings(record=True) as w: + warnings.filterwarnings('always', '', np.VisibleDeprecationWarning) + test = np.ndfromtxt(data, usecols=('A', 'C', 'D'), + names=True, dtype=None) + assert_(w[0].category is np.VisibleDeprecationWarning) control = np.array(('aaaa', 45, 9.1), dtype=[('A', '|S4'), ('C', int), ('D', float)]) assert_equal(test, control) @@ -1104,8 +1297,12 @@ M 33 21.99 def test_converters_with_usecols_and_names(self): # Tests names and usecols data = TextIO('A B C D\n aaaa 121 45 9.1') - test = np.ndfromtxt(data, usecols=('A', 'C', 'D'), names=True, - dtype=None, converters={'C': lambda s: 2 * int(s)}) + with warnings.catch_warnings(record=True) as w: + warnings.filterwarnings('always', '', np.VisibleDeprecationWarning) + test = np.ndfromtxt(data, usecols=('A', 'C', 'D'), names=True, + dtype=None, + converters={'C': lambda s: 2 * int(s)}) + assert_(w[0].category is np.VisibleDeprecationWarning) control = np.array(('aaaa', 90, 9.1), dtype=[('A', '|S4'), ('C', int), ('D', float)]) assert_equal(test, control) @@ -1225,6 +1422,18 @@ M 33 21.99 dtype=[('', '|S10'), ('', float)]) assert_equal(test, control) + def test_utf8_userconverters_with_explicit_dtype(self): + utf8 = b'\xcf\x96' + with temppath() as path: + with open(path, 'wb') as f: + f.write(b'skip,skip,2001-01-01' + utf8 + b',1.0,skip') + test = np.genfromtxt(path, delimiter=",", names=None, dtype=float, + usecols=(2, 3), converters={2: np.unicode}, + encoding='UTF-8') + control = np.array([('2001-01-01' + utf8.decode('UTF-8'), 1.)], + dtype=[('', '|U11'), ('', float)]) + assert_equal(test, control) + def test_spacedelimiter(self): # Test space delimiter data = TextIO("1 2 3 4 5\n6 7 8 9 10") @@ -1551,11 +1760,17 @@ M 33 21.99 # Test autostrip data = "01/01/2003 , 1.3, abcde" kwargs = dict(delimiter=",", dtype=None) - mtest = np.ndfromtxt(TextIO(data), **kwargs) + with warnings.catch_warnings(record=True) as w: + warnings.filterwarnings('always', '', np.VisibleDeprecationWarning) + mtest = np.ndfromtxt(TextIO(data), **kwargs) + assert_(w[0].category is np.VisibleDeprecationWarning) ctrl = np.array([('01/01/2003 ', 1.3, ' abcde')], dtype=[('f0', '|S12'), ('f1', float), ('f2', '|S8')]) assert_equal(mtest, ctrl) - mtest = np.ndfromtxt(TextIO(data), autostrip=True, **kwargs) + with warnings.catch_warnings(record=True) as w: + warnings.filterwarnings('always', '', np.VisibleDeprecationWarning) + mtest = np.ndfromtxt(TextIO(data), autostrip=True, **kwargs) + assert_(w[0].category is np.VisibleDeprecationWarning) ctrl = 
np.array([('01/01/2003', 1.3, 'abcde')], dtype=[('f0', '|S10'), ('f1', float), ('f2', '|S5')]) assert_equal(mtest, ctrl) @@ -1675,13 +1890,116 @@ M 33 21.99 def test_comments_is_none(self): # Github issue 329 (None was previously being converted to 'None'). - test = np.genfromtxt(TextIO("test1,testNonetherestofthedata"), - dtype=None, comments=None, delimiter=',') + with warnings.catch_warnings(record=True) as w: + warnings.filterwarnings('always', '', np.VisibleDeprecationWarning) + test = np.genfromtxt(TextIO("test1,testNonetherestofthedata"), + dtype=None, comments=None, delimiter=',') + assert_(w[0].category is np.VisibleDeprecationWarning) assert_equal(test[1], b'testNonetherestofthedata') - test = np.genfromtxt(TextIO("test1, testNonetherestofthedata"), - dtype=None, comments=None, delimiter=',') + with warnings.catch_warnings(record=True) as w: + warnings.filterwarnings('always', '', np.VisibleDeprecationWarning) + test = np.genfromtxt(TextIO("test1, testNonetherestofthedata"), + dtype=None, comments=None, delimiter=',') + assert_(w[0].category is np.VisibleDeprecationWarning) assert_equal(test[1], b' testNonetherestofthedata') + def test_latin1(self): + latin1 = b'\xf6\xfc\xf6' + norm = b"norm1,norm2,norm3\n" + enc = b"test1,testNonethe" + latin1 + b",test3\n" + s = norm + enc + norm + with warnings.catch_warnings(record=True) as w: + warnings.filterwarnings('always', '', np.VisibleDeprecationWarning) + test = np.genfromtxt(TextIO(s), + dtype=None, comments=None, delimiter=',') + assert_(w[0].category is np.VisibleDeprecationWarning) + assert_equal(test[1, 0], b"test1") + assert_equal(test[1, 1], b"testNonethe" + latin1) + assert_equal(test[1, 2], b"test3") + test = np.genfromtxt(TextIO(s), + dtype=None, comments=None, delimiter=',', + encoding='latin1') + assert_equal(test[1, 0], u"test1") + assert_equal(test[1, 1], u"testNonethe" + latin1.decode('latin1')) + assert_equal(test[1, 2], u"test3") + + with warnings.catch_warnings(record=True) as w: + warnings.filterwarnings('always', '', np.VisibleDeprecationWarning) + test = np.genfromtxt(TextIO(b"0,testNonethe" + latin1), + dtype=None, comments=None, delimiter=',') + assert_(w[0].category is np.VisibleDeprecationWarning) + assert_equal(test['f0'], 0) + assert_equal(test['f1'], b"testNonethe" + latin1) + + def test_binary_decode_autodtype(self): + utf16 = b'\xff\xfeh\x04 \x00i\x04 \x00j\x04' + v = getattr(np, self.loadfunc)(BytesIO(utf16), dtype=None, + encoding='UTF-16') + assert_array_equal(v, np.array(utf16.decode('UTF-16').split())) + + def test_utf8_byte_encoding(self): + utf8 = b"\xcf\x96" + norm = b"norm1,norm2,norm3\n" + enc = b"test1,testNonethe" + utf8 + b",test3\n" + s = norm + enc + norm + with warnings.catch_warnings(record=True) as w: + warnings.filterwarnings('always', '', np.VisibleDeprecationWarning) + test = np.genfromtxt(TextIO(s), + dtype=None, comments=None, delimiter=',') + assert_(w[0].category is np.VisibleDeprecationWarning) + ctl = np.array([ + [b'norm1', b'norm2', b'norm3'], + [b'test1', b'testNonethe' + utf8, b'test3'], + [b'norm1', b'norm2', b'norm3']]) + assert_array_equal(test, ctl) + + def test_utf8_file(self): + utf8 = b"\xcf\x96" + latin1 = b"\xf6\xfc\xf6" + with temppath() as path: + with open(path, "wb") as f: + f.write((b"test1,testNonethe" + utf8 + b",test3\n") * 2) + test = np.genfromtxt(path, dtype=None, comments=None, + delimiter=',', encoding="UTF-8") + ctl = np.array([ + ["test1", "testNonethe" + utf8.decode("UTF-8"), "test3"], + ["test1", "testNonethe" + utf8.decode("UTF-8"), "test3"]], + 
dtype=np.unicode) + assert_array_equal(test, ctl) + + # test a mixed dtype + with open(path, "wb") as f: + f.write(b"0,testNonethe" + utf8) + test = np.genfromtxt(path, dtype=None, comments=None, + delimiter=',', encoding="UTF-8") + assert_equal(test['f0'], 0) + assert_equal(test['f1'], "testNonethe" + utf8.decode("UTF-8")) + + @np.testing.dec.skipif(can_encode(b"\xcf\x96".decode('UTF-8'))) + def test_utf8_file_nodtype_unicode(self): + # bytes encoding with non-latin1 -> unicode upcast + utf8 = b"\xcf\x96" + latin1 = b"\xf6\xfc\xf6" + with temppath() as path: + with io.open(path, "wt", + encoding=locale.getpreferredencoding()) as f: + f.write(u"norm1,norm2,norm3\n") + f.write(u"norm1," + latin1.decode("latin1") + u",norm3\n") + f.write(u"test1,testNonethe" + utf8.decode("UTF-8") + + u",test3\n") + with warnings.catch_warnings(record=True) as w: + warnings.filterwarnings('always', '', + np.VisibleDeprecationWarning) + test = np.genfromtxt(path, dtype=None, comments=None, + delimiter=',') + assert_(w[0].category is np.VisibleDeprecationWarning) + ctl = np.array([ + ["norm1", "norm2", "norm3"], + ["norm1", latin1.decode("latin1"), "norm3"], + ["test1", "testNonethe" + utf8.decode("UTF-8"), "test3"]], + dtype=np.unicode) + assert_array_equal(test, ctl) + def test_recfromtxt(self): # data = TextIO('A,B\n0,1\n2,3') @@ -1793,11 +2111,7 @@ M 33 21.99 # Test that we can load data from a filename as well as a file # object tgt = np.arange(6).reshape((2, 3)) - if sys.version_info[0] >= 3: - # python 3k is known to fail for '\r' - linesep = ('\n', '\r\n') - else: - linesep = ('\n', '\r\n', '\r') + linesep = ('\n', '\r\n', '\r') for sep in linesep: data = '0 1 2' + sep + '3 4 5' @@ -1807,6 +2121,22 @@ M 33 21.99 res = np.genfromtxt(name) assert_array_equal(res, tgt) + def test_gft_from_gzip(self): + # Test that we can load data from a gzipped file + wanted = np.arange(6).reshape((2, 3)) + linesep = ('\n', '\r\n', '\r') + + for sep in linesep: + data = '0 1 2' + sep + '3 4 5' + s = BytesIO() + with gzip.GzipFile(fileobj=s, mode='w') as g: + g.write(asbytes(data)) + + with temppath(suffix='.gz2') as name: + with open(name, 'w') as f: + f.write(data) + assert_array_equal(np.genfromtxt(name), wanted) + def test_gft_using_generator(self): # gft doesn't work with unicode. def count(): -- cgit v1.2.1 From 55273d236945aa5f4b6e01682dfef82384a7fd65 Mon Sep 17 00:00:00 2001 From: Charles Harris Date: Sun, 19 Nov 2017 11:51:55 -0700 Subject: DOC: Add some docstrings and edit others. Add docstrings for some of the support functions in _datasource and npyio in order to aid future maintainers. [ci skip] --- numpy/lib/_datasource.py | 44 ++++++++++++++++++++++++++++++++++++++++++-- numpy/lib/npyio.py | 42 +++++++++++++++++++++++++++--------------- 2 files changed, 69 insertions(+), 17 deletions(-) (limited to 'numpy/lib') diff --git a/numpy/lib/_datasource.py b/numpy/lib/_datasource.py index ad939df3f..1b5ecb34e 100644 --- a/numpy/lib/_datasource.py +++ b/numpy/lib/_datasource.py @@ -43,6 +43,18 @@ import io _open = open def _check_mode(mode, encoding, newline): + """Check mode and that encoding and newline are compatible. + + Parameters + ---------- + mode : str + File open mode. + encoding : str + File encoding. + newline : str + Newline for text files. 
From 55273d236945aa5f4b6e01682dfef82384a7fd65 Mon Sep 17 00:00:00 2001
From: Charles Harris
Date: Sun, 19 Nov 2017 11:51:55 -0700
Subject: DOC: Add some docstrings and edit others.

Add docstrings for some of the support functions in _datasource and
npyio in order to aid future maintainers.

[ci skip]
---
 numpy/lib/_datasource.py | 44 ++++++++++++++++++++++++++++++++++++++++++--
 numpy/lib/npyio.py       | 42 +++++++++++++++++++++++++++---------------
 2 files changed, 69 insertions(+), 17 deletions(-)

(limited to 'numpy/lib')

diff --git a/numpy/lib/_datasource.py b/numpy/lib/_datasource.py
index ad939df3f..1b5ecb34e 100644
--- a/numpy/lib/_datasource.py
+++ b/numpy/lib/_datasource.py
@@ -43,6 +43,18 @@ import io

 _open = open

 def _check_mode(mode, encoding, newline):
+    """Check mode and that encoding and newline are compatible.
+
+    Parameters
+    ----------
+    mode : str
+        File open mode.
+    encoding : str
+        File encoding.
+    newline : str
+        Newline for text files.
+
+    """
     if "t" in mode:
         if "b" in mode:
             raise ValueError("Invalid mode: %r" % (mode,))
@@ -52,8 +64,21 @@ def _check_mode(mode, encoding, newline):
         if newline is not None:
             raise ValueError("Argument 'newline' not supported in binary mode")

+
 def _python2_bz2open(fn, mode, encoding, newline):
-    """ wrapper to open bz2 in text mode """
+    """Wrapper to open bz2 in text mode.
+
+    Parameters
+    ----------
+    fn : str
+        File name
+    mode : {'r', 'w'}
+        File mode. Note that bz2 text files are not supported.
+    encoding : str
+        Ignored, text bz2 files not supported in Python2.
+    newline : str
+        Ignored, text bz2 files not supported in Python2.
+    """
     import bz2

     _check_mode(mode, encoding, newline)
@@ -65,7 +90,21 @@ def _python2_bz2open(fn, mode, encoding, newline):
         return bz2.BZ2File(fn, mode)

 def _python2_gzipopen(fn, mode, encoding, newline):
-    """ wrapper to open gzip in text mode """
+    """Wrapper to open gzip in text mode.
+
+    Parameters
+    ----------
+    fn : str, bytes, file
+        File path or opened file.
+    mode : str
+        File mode. The actual files are opened as binary, but will be
+        decoded using the specified `encoding` and `newline`.
+    encoding : str
+        Encoding to be used when reading/writing as text.
+    newline : str
+        Newline to be used when reading/writing as text.
+
+    """
     import gzip
     # gzip is lacking read1 needed for TextIOWrapper
     class GzipWrap(gzip.GzipFile):
@@ -75,6 +114,7 @@ def _python2_gzipopen(fn, mode, encoding, newline):
     _check_mode(mode, encoding, newline)

     gz_mode = mode.replace("t", "")
+
     if isinstance(fn, (str, bytes)):
         binary_file = GzipWrap(fn, gz_mode)
     elif hasattr(fn, "read") or hasattr(fn, "write"):
diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py
index fe2aa436b..6b65834ed 100644
--- a/numpy/lib/npyio.py
+++ b/numpy/lib/npyio.py
@@ -296,7 +296,7 @@ def load(file, mmap_mode=None, allow_pickle=True, fix_imports=True,
         used in Python 3.
     encoding : str, optional
         What encoding to use when reading Python 2 strings. Only useful when
-        loading Python 2 generated pickled files on Python 3, which includes
+        loading Python 2 generated pickled files in Python 3, which includes
         npy/npz files containing object arrays. Values other than 'latin1',
         'ASCII', and 'bytes' are not allowed, as they can corrupt numerical
         data. Default: 'ASCII'
@@ -819,13 +819,13 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
         Legal values: 0 (default), 1 or 2.

         .. versionadded:: 1.6.0
-    encoding: string, optional
+    encoding : str, optional
         Encoding used to decode the input file. Does not apply to input streams.
         The special value 'bytes' enables backward compatibility workarounds
         that ensure you receive byte arrays as results if possible and pass
         latin1 encoded strings to converters. Override this value to receive
-        unicode arrays and pass strings as input to converters.
-        If set to None the system default is used.
+        unicode arrays and pass strings as input to converters. If set to None
+        the system default is used. The default value is 'bytes'.

         .. versionadded:: 1.14.0
@@ -993,7 +993,17 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
             return []

     def read_data(chunk_size):
-        # Parse each line, including the first
+        """Parse each line, including the first.
+
+        The file read, `fh`, is a global defined above.
+
+        Parameters
+        ----------
+        chunk_size : int
+            At most `chunk_size` lines are read at a time, with iteration
+            until all lines are read.
+
+        """
         X = []
         for i, line in enumerate(itertools.chain([first_line], fh)):
             vals = split_line(line)
@@ -1171,7 +1181,7 @@ def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='',
         ``numpy.loadtxt``.

         .. versionadded:: 1.7.0
-    encoding: string, optional
+    encoding : str, optional
         Encoding used to encode the output file. Does not apply to output
         streams.

         .. versionadded:: 1.14.0
@@ -1251,7 +1261,9 @@ def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='',
         delimiter = asstr(delimiter)

     class WriteWrap(object):
-        """ convert to unicode in py2 or to bytes on bytestream inputs """
+        """Convert to unicode in py2 or to bytes on bytestream inputs.
+
+        """
         def __init__(self, fh, encoding):
             self.fh = fh
             self.encoding = encoding
@@ -1387,7 +1399,7 @@ def fromregex(file, regexp, dtype, encoding=None):
         Groups in the regular expression correspond to fields in the dtype.
     dtype : dtype or list of dtypes
         Dtype for the structured array.
-    encoding: string, optional
+    encoding : str, optional
         Encoding used to decode the input file. Does not apply to input streams.

         .. versionadded:: 1.14.0
@@ -1562,13 +1574,13 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
         to read the entire file.

         .. versionadded:: 1.10.0
-    encoding: string, optional
-        Encoding used to decode the inputfile. Does not apply to input streams.
-        The special value 'bytes' enables backward compatibility workarounds
-        that ensures you receive byte arrays as results if possible and passes
-        latin1 encoded strings to converters. Override this value to receive
-        unicode arrays and pass strings as input to converters.
-        If set to None the system default is used.
+    encoding : str, optional
+        Encoding used to decode the input file. Does not apply when `fname` is
+        a file object. The special value 'bytes' enables backward compatibility
+        workarounds that ensure that you receive byte arrays when possible
+        and pass latin1 encoded strings to converters. Override this value to
+        receive unicode arrays and pass strings as input to converters. If set
+        to None the system default is used. The default value is 'bytes'.

         .. versionadded:: 1.14.0
--
cgit v1.2.1
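
The loadtxt/savetxt docstrings above describe a symmetric round trip through
the new keyword. A minimal sketch of that round trip, assuming Python 3 and a
hypothetical file name::

    import numpy as np

    a = np.array([[0.0, 1.0], [2.0, 3.0]])

    # savetxt encodes the formatted rows before writing them out.
    np.savetxt('data.txt', a, fmt='%.2f', encoding='UTF-8')

    # loadtxt decodes the file with the same encoding before parsing.
    b = np.loadtxt('data.txt', encoding='UTF-8')
    assert np.array_equal(a, b)
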
From d9ca11117f37d48d07818a3aae3641c023454269 Mon Sep 17 00:00:00 2001
From: Charles Harris
Date: Sun, 19 Nov 2017 13:43:32 -0700
Subject: MAINT: Refactor some code in npyio.py.

---
 numpy/lib/_datasource.py         |  8 ++++-
 numpy/lib/_iotools.py            | 48 ++++++++++++++++++-------
 numpy/lib/npyio.py               | 78 ++++++++++++++++++++--------------------
 numpy/lib/tests/test__iotools.py |  2 --
 4 files changed, 81 insertions(+), 55 deletions(-)

(limited to 'numpy/lib')

diff --git a/numpy/lib/_datasource.py b/numpy/lib/_datasource.py
index 1b5ecb34e..aec84865f 100644
--- a/numpy/lib/_datasource.py
+++ b/numpy/lib/_datasource.py
@@ -164,6 +164,7 @@ class _FileOpeners(object):
     def _load(self):
         if self._loaded:
             return
+
         try:
             import bz2
             if sys.version_info[0] >= 3:
@@ -172,6 +173,7 @@ class _FileOpeners(object):
                 self._file_openers[".bz2"] = _python2_bz2open
         except ImportError:
             pass
+
         try:
             import gzip
             if sys.version_info[0] >= 3:
@@ -180,12 +182,16 @@ class _FileOpeners(object):
                 self._file_openers[".gz"] = _python2_gzipopen
         except ImportError:
             pass
+
         try:
             import lzma
             self._file_openers[".xz"] = lzma.open
             self._file_openers[".lzma"] = lzma.open
-        except ImportError:
+        except (ImportError, AttributeError):
+            # There are incompatible backports of lzma that do not have the
+            # lzma.open attribute, so catch that as well as ImportError.
             pass
+
         self._loaded = True

     def keys(self):
diff --git a/numpy/lib/_iotools.py b/numpy/lib/_iotools.py
index 8e091d42d..b7db77f32 100644
--- a/numpy/lib/_iotools.py
+++ b/numpy/lib/_iotools.py
@@ -18,7 +18,24 @@ else:

 def _decode_line(line, encoding=None):
-    """ decode bytes from binary input streams, default to latin1 """
+    """Decode bytes from binary input streams.
+
+    Defaults to decoding from 'latin1'. That differs from the behavior of
+    np.compat.asunicode that decodes from 'ascii'.
+
+    Parameters
+    ----------
+    line : str or bytes
+        Line to be decoded.
+    encoding : str, optional
+        Encoding used to decode `line`; 'latin1' if not given.
+
+    Returns
+    -------
+    decoded_line : unicode
+        Unicode in Python 2, a str (unicode) in Python 3.
+
+    """
     if type(line) is bytes:
         if encoding is None:
             line = line.decode('latin1')
@@ -510,8 +525,10 @@ class StringConverter(object):
         Value to return by default, that is, when the string to be converted
         is flagged as missing. If not given, `StringConverter` tries to supply
         a reasonable default value.
-    missing_values : sequence of str, optional
-        Sequence of strings indicating a missing value.
+    missing_values : {None, sequence of str}, optional
+        ``None`` or sequence of strings indicating a missing value. If ``None``
+        then missing values are indicated by empty entries. The default is
+        ``None``.
     locked : bool, optional
         Whether the StringConverter should be locked to prevent automatic
         upgrade or not. Default is False.
@@ -813,8 +830,9 @@ class StringConverter(object):
             A string representing a standard input value of the converter.
             This string is used to help defining a reasonable default
            value.
-        missing_values : sequence of str, optional
-            Sequence of strings indicating a missing value.
+        missing_values : {sequence of str, None}, optional
+            Sequence of strings indicating a missing value. If ``None``, then
+            the existing `missing_values` are cleared. The default is `''`.
         locked : bool, optional
             Whether the StringConverter should be locked to prevent
             automatic upgrade or not. Default is False.
@@ -828,6 +846,7 @@ class StringConverter(object):
         """
         self.func = func
         self._locked = locked
+
         # Don't reset the default to None if we can avoid it
         if default is not None:
             self.default = default
@@ -838,15 +857,18 @@ class StringConverter(object):
         except (TypeError, ValueError):
             tester = None
         self.type = self._dtypeortype(self._getdtype(tester))
-        # Add the missing values to the existing set
-        if missing_values is not None:
-            if isinstance(missing_values, basestring):
-                self.missing_values.add(missing_values)
-            elif hasattr(missing_values, '__iter__'):
-                for val in missing_values:
-                    self.missing_values.add(val)
+
+        # Add the missing values to the existing set or clear it.
+        if missing_values is None:
+            # Clear all missing values even though the ctor initializes it to
+            # {''} when the argument is None.
+            self.missing_values = {}
         else:
-            self.missing_values = []
+            if not np.iterable(missing_values):
+                missing_values = [missing_values]
+            if not all(isinstance(v, basestring) for v in missing_values):
+                raise TypeError("missing_values must be strings or unicode")
+            self.missing_values.update(missing_values)

 def easy_dtype(ndtype, names=None, defaultfmt="f%i", **validationargs):
diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py
index 6b65834ed..e4d827334 100644
--- a/numpy/lib/npyio.py
+++ b/numpy/lib/npyio.py
@@ -734,7 +734,7 @@ def _getconv(dtype):
     def floatconv(x):
         x.lower()
         if '0x' in x:
-            return float.fromhex(asstr(x))
+            return float.fromhex(x)
         return float(x)

     typ = dtype.type
@@ -782,13 +782,13 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
         each row will be interpreted as an element of the array. In this
         case, the number of columns used must match the number of fields in
         the data-type.
-    comments : str or sequence, optional
+    comments : str or sequence of str, optional
         The characters or list of characters used to indicate the start of a
-        comment;
-        default: '#'.
+        comment. For backwards compatibility, byte strings will be decoded as
+        'latin1'. The default is '#'.
     delimiter : str, optional
-        The string used to separate values. By default, this is any
-        whitespace.
+        The string used to separate values. For backwards compatibility, byte
+        strings will be decoded as 'latin1'. The default is whitespace.
     converters : dict, optional
         A dictionary mapping column number to a function that will convert
         that column to a float.  E.g., if column 0 is a date string:
@@ -797,18 +797,15 @@
         ``converters = {3: lambda s: float(s.strip() or 0)}``.  Default: None.
     skiprows : int, optional
         Skip the first `skiprows` lines; default: 0.
-
     usecols : int or sequence, optional
         Which columns to read, with 0 being the first. For example,
         usecols = (1,4,5) will extract the 2nd, 5th and 6th columns.
         The default, None, results in all columns being read.

-        .. versionadded:: 1.11.0
-
-        Also when a single column has to be read it is possible to use
-        an integer instead of a tuple. E.g ``usecols = 3`` reads the
-        fourth column the same way as `usecols = (3,)`` would.
-
+        .. versionchanged:: 1.11.0
+            When a single column has to be read it is possible to use
+            an integer instead of a tuple. E.g. ``usecols = 3`` reads the
+            fourth column the same way as ``usecols = (3,)`` would.
     unpack : bool, optional
         If True, the returned array is transposed, so that arguments may be
         unpacked using ``x, y, z = loadtxt(...)``.  When used with a structured
@@ -877,12 +874,14 @@
     if comments is not None:
         if isinstance(comments, (basestring, bytes)):
             comments = [comments]
-
         comments = [_decode_line(x) for x in comments]
+
         # Compile regex for comments beforehand
         comments = (re.escape(comment) for comment in comments)
         regex_comments = re.compile('|'.join(comments))
+
+    if delimiter is not None:
+        delimiter = _decode_line(delimiter)
+
     user_converters = converters

     if encoding == 'bytes':
@@ -1071,7 +1070,7 @@
                 # Unused converter specified
                 continue
             if byte_converters:
-                # converters may use decode to workaround numpy's oldd behaviour,
+                # converters may use decode to workaround numpy's old behaviour,
                 # so encode the string again before passing to the user converter
                 def tobytes_first(x, conv):
                     if type(x) is bytes:
@@ -1181,9 +1180,11 @@ def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='',
         ``numpy.loadtxt``.

         .. versionadded:: 1.7.0
-    encoding : str, optional
+    encoding : {None, str}, optional
         Encoding used to encode the output file. Does not apply to output
-        streams.
+        streams. If the encoding is something other than 'bytes' or 'latin1'
+        you will not be able to load the file in NumPy versions < 1.14. Default
+        is 'latin1'.

         .. versionadded:: 1.14.0
@@ -1908,7 +1909,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
                 if conv is bytes:
                     user_conv = asbytes
                 elif byte_converters:
-                    # converters may use decode to workaround numpy's oldd behaviour,
+                    # converters may use decode to workaround numpy's old behaviour,
                     # so encode the string again before passing to the user converter
                     def tobytes_first(x, conv):
                         if type(x) is bytes:
@@ -1927,7 +1928,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
         user_converters.update(uc_update)

     # Fixme: possible error as following variable never used.
-    #miss_chars = [_.missing_values for _ in converters]
+    # miss_chars = [_.missing_values for _ in converters]

     # Initialize the output lists ...
     # ... rows
@@ -2041,39 +2042,38 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
             strcolidx = [i for (i, v) in enumerate(column_types)
                          if v == np.unicode_]
-            typestr = 'U'
+
+            type_str = np.unicode_
             if byte_converters and strcolidx:
                 # convert strings back to bytes for backward compatibility
                 warnings.warn(
-                    "Reading strings without specifying the encoding argument is "
-                    "deprecated. Set encoding, use None for the system default.",
+                    "Reading unicode strings without specifying the encoding "
+                    "argument is deprecated. Set the encoding, use None for the "
+                    "system default.",
                     np.VisibleDeprecationWarning, stacklevel=2)
+
+                def encode_unicode_cols(row_tup):
+                    row = list(row_tup)
+                    for i in strcolidx:
+                        row[i] = row[i].encode('latin1')
+                    return tuple(row)
+
                 try:
-                    for j in range(len(data)):
-                        row = list(data[j])
-                        for i in strcolidx:
-                            row[i] = row[i].encode('latin1')
-                        data[j] = tuple(row)
-                    typestr = 'S'
+                    data = [encode_unicode_cols(r) for r in data]
+                    type_str = np.bytes_
                 except UnicodeEncodeError:
-                    # we must use unicode, revert encoding
-                    for k in range(0, j + 1):
-                        row = list(data[k])
-                        for i in strcolidx:
-                            if isinstance(row[i], bytes):
-                                row[i] = row[i].decode('latin1')
-                        data[k] = tuple(row)
+                    pass
+
             # ... and take the largest number of chars.
             for i in strcolidx:
-                column_types[i] = "|%s%i" % (typestr, max(len(row[i]) for row in data))
+                max_line_length = max(len(row[i]) for row in data)
+                column_types[i] = np.dtype((type_str, max_line_length))
+
         #
         if names is None:
             # If the dtype is uniform, don't define names, else use ''
             base = set([c.type for c in converters if c._checked])
             if len(base) == 1:
                 if strcolidx:
-                    (ddtype, mdtype) = (typestr, bool)
+                    (ddtype, mdtype) = (type_str, bool)
                 else:
                     (ddtype, mdtype) = (list(base)[0], bool)
             else:
@@ -2148,7 +2148,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
     # Try to take care of the missing data we missed
     names = output.dtype.names
     if usemask and names:
-        for (name, conv) in zip(names or (), converters):
+        for (name, conv) in zip(names, converters):
             missing_values = [conv(_) for _ in conv.missing_values
                               if _ != '']
             for mval in missing_values:
diff --git a/numpy/lib/tests/test__iotools.py b/numpy/lib/tests/test__iotools.py
index 990ee126d..b25b42f8c 100644
--- a/numpy/lib/tests/test__iotools.py
+++ b/numpy/lib/tests/test__iotools.py
@@ -133,8 +133,6 @@ class TestNameValidator(object):

 def _bytes_to_date(s):
-    if type(s) == bytes:
-        s = s.decode("latin1")
     return date(*time.strptime(s, "%Y-%m-%d")[:3])
--
cgit v1.2.1
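
The missing_values handling refactored above is easiest to see through
StringConverter directly. A minimal sketch against the private
numpy.lib._iotools API, with hypothetical sample values::

    from numpy.lib._iotools import StringConverter

    # genfromtxt drives field conversion through StringConverter; strings
    # listed in missing_values make the converter fall back to its default.
    conv = StringConverter(int, default=-1, missing_values="N/A")
    print(conv('42'))   # -> 42
    print(conv('N/A'))  # -> -1, flagged as missing
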
From 1d97b3aafdca2722bbe2f0c10a96544121c8f78b Mon Sep 17 00:00:00 2001
From: Charles Harris
Date: Tue, 21 Nov 2017 10:10:26 -0700
Subject: MAINT: Various minor code cleanups.

Minor cleanups of old code to reflect more modern usage.
---
 numpy/lib/_datasource.py         |  15 +++++
 numpy/lib/_iotools.py            |   4 +-
 numpy/lib/tests/test__iotools.py |  20 +++++--
 numpy/lib/tests/test_io.py       | 115 ++++++++++++++++++++-------------------
 4 files changed, 90 insertions(+), 64 deletions(-)

(limited to 'numpy/lib')

diff --git a/numpy/lib/_datasource.py b/numpy/lib/_datasource.py
index aec84865f..6f1295f09 100644
--- a/numpy/lib/_datasource.py
+++ b/numpy/lib/_datasource.py
@@ -238,6 +238,11 @@ def open(path, mode='r', destpath=os.curdir, encoding=None, newline=None):
         Path to the directory where the source file gets downloaded to for
         use.  If `destpath` is None, a temporary directory will be created.
         The default path is the current directory.
+    encoding : {None, str}, optional
+        Open text file with given encoding. The default encoding will be
+        what `io.open` uses.
+    newline : {None, str}, optional
+        Newline to use when reading text file.

     Returns
     -------
@@ -577,6 +582,11 @@ class DataSource (object):
             Mode to open `path`.  Mode 'r' for reading, 'w' for writing,
             'a' to append. Available modes depend on the type of object
             specified by `path`. Default is 'r'.
+        encoding : {None, str}, optional
+            Open text file with given encoding. The default encoding will be
+            what `io.open` uses.
+        newline : {None, str}, optional
+            Newline to use when reading text file.

         Returns
         -------
@@ -741,6 +751,11 @@ class Repository (DataSource):
             Mode to open `path`.  Mode 'r' for reading, 'w' for writing,
             'a' to append. Available modes depend on the type of object
             specified by `path`. Default is 'r'.
+        encoding : {None, str}, optional
+            Open text file with given encoding. The default encoding will be
+            what `io.open` uses.
+        newline : {None, str}, optional
+            Newline to use when reading text file.

         Returns
         -------
diff --git a/numpy/lib/_iotools.py b/numpy/lib/_iotools.py
index b7db77f32..27143e5c6 100644
--- a/numpy/lib/_iotools.py
+++ b/numpy/lib/_iotools.py
@@ -861,8 +861,8 @@ class StringConverter(object):

         # Add the missing values to the existing set or clear it.
         if missing_values is None:
             # Clear all missing values even though the ctor initializes it to
-            # {''} when the argument is None.
-            self.missing_values = {}
+            # set(['']) when the argument is None.
+            self.missing_values = set()
         else:
             if not np.iterable(missing_values):
                 missing_values = [missing_values]
diff --git a/numpy/lib/tests/test__iotools.py b/numpy/lib/tests/test__iotools.py
index b25b42f8c..54fac8da4 100644
--- a/numpy/lib/tests/test__iotools.py
+++ b/numpy/lib/tests/test__iotools.py
@@ -12,6 +12,7 @@ from numpy.lib._iotools import (
     LineSplitter, NameValidator, StringConverter,
     has_nested_fields, easy_dtype, flatten_dtype
     )
+from numpy.compat import unicode

 class TestLineSplitter(object):
@@ -155,10 +156,10 @@ class TestStringConverter(object):
         assert_equal(converter.upgrade('0'), 0)
         assert_equal(converter._status, 1)

-        # On systems where integer defaults to 32-bit, the statuses will be
+        # On systems where long defaults to 32-bit, the statuses will be
         # offset by one, so we check for this here.
         import numpy.core.numeric as nx
-        status_offset = int(nx.dtype(nx.integer).itemsize < nx.dtype(nx.int64).itemsize)
+        status_offset = int(nx.dtype(nx.int_).itemsize < nx.dtype(nx.int64).itemsize)

         # test int > 2**32
         assert_equal(converter.upgrade('17179869184'), 17179869184)
@@ -172,9 +173,15 @@ class TestStringConverter(object):
         assert_equal(converter.upgrade('0j'), complex('0j'))
         assert_equal(converter._status, 3 + status_offset)

-        # test str TODO
-        #assert_equal(converter.upgrade(b'a'), b'a')
-        #assert_equal(converter._status, len(converter._mapper) - 1)
+        # test str
+        # note that the longdouble type has been skipped, so the
+        # _status increases by 2. Everything should succeed with
+        # unicode conversion (5).
+        for s in ['a', u'a', b'a']:
+            res = converter.upgrade(s)
+            assert_(type(res) is unicode)
+            assert_equal(res, u'a')
+        assert_equal(converter._status, 5 + status_offset)

     def test_missing(self):
         "Tests the use of missing values."
@@ -204,8 +211,9 @@ class TestStringConverter(object):

     def test_string_to_object(self):
         "Make sure that string-to-object functions are properly recognized"
+        old_mapper = StringConverter._mapper[:]  # copy of list
         conv = StringConverter(_bytes_to_date)
-        assert_equal(conv._mapper[-3][0](0), 0j)
+        assert_equal(conv._mapper, old_mapper)
         assert_(hasattr(conv, 'default'))

     def test_keep_default(self):
diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py
index 35c37c7be..75a8e4968 100644
--- a/numpy/lib/tests/test_io.py
+++ b/numpy/lib/tests/test_io.py
@@ -20,19 +20,11 @@ from numpy.lib._iotools import ConverterError, ConversionWarning
 from numpy.compat import asbytes, bytes, unicode, Path
 from numpy.ma.testutils import assert_equal
 from numpy.testing import (
-    run_module_suite, assert_warns, assert_,
+    run_module_suite, assert_warns, assert_, SkipTest,
     assert_raises_regex, assert_raises, assert_allclose,
     assert_array_equal, temppath, tempdir, dec, IS_PYPY, suppress_warnings
     )

-def can_encode(v):
-    """ check if bytes can be decoded with default encoding """
-    try:
-        v.encode(locale.getpreferredencoding())
-        return False  # no skipping
-    except UnicodeEncodeError:
-        return True
-

 class TextIO(BytesIO):
     """Helper IO class.
@@ -164,7 +156,7 @@ class RoundtripTest(object):
         a = np.array([1, 2, 3, 4], int)
         self.roundtrip(a)

-    @np.testing.dec.knownfailureif(sys.platform == 'win32', "Fail on Win32")
+    @dec.knownfailureif(sys.platform == 'win32', "Fail on Win32")
     def test_mmap(self):
         a = np.array([[1, 2.5], [4, 7.3]])
         self.roundtrip(a, file_on_disk=True, load_kwds={'mmap_mode': 'r'})
@@ -208,8 +200,8 @@ class TestSavezLoad(RoundtripTest):
                 self.arr_reloaded.fid.close()
                 os.remove(self.arr_reloaded.fid.name)

-    @np.testing.dec.skipif(not IS_64BIT, "Works only with 64bit systems")
-    @np.testing.dec.slow
+    @dec.skipif(not IS_64BIT, "Works only with 64bit systems")
+    @dec.slow
     def test_big_arrays(self):
         L = (1 << 31) + 100000
         a = np.empty(L, dtype=np.uint8)
@@ -285,7 +277,7 @@ class TestSavezLoad(RoundtripTest):
             fp.seek(0)
             assert_(not fp.closed)

-    @np.testing.dec.skipif(IS_PYPY, "context manager required on PyPy")
+    @dec.skipif(IS_PYPY, "context manager required on PyPy")
     def test_closing_fid(self):
         # Test that issue #1517 (too many opened files) remains closed
         # It might be a "weak" test since failed to get triggered on
@@ -351,8 +343,8 @@ class TestSaveTxt(object):

     def test_0D_3D(self):
         c = BytesIO()
-        assert_raises(ValueError, np.savetxt, c, np.array(1))
-        assert_raises(ValueError, np.savetxt, c, np.array([[[1], [2]]]))
+        assert_raises(ValueError, np.savetxt, c, np.array(1))
+        assert_raises(ValueError, np.savetxt, c, np.array([[[1], [2]]]))

     def test_record(self):
@@ -530,7 +522,7 @@ class TestSaveTxt(object):
             assert_equal(s.read(), utf8 + '\n')

-class LoadTxtBase:
+class LoadTxtBase(object):
     def check_compressed(self, fopen, suffixes):
         # Test that we can load data from a compressed file
         wanted = np.arange(6).reshape((2, 3))
@@ -541,23 +533,22 @@
             with temppath(suffix=suffix) as name:
                 with fopen(name, mode='wt', encoding='UTF-32-LE') as f:
                     f.write(data)
-                res = getattr(np, self.loadfunc)(name,
-                                                 encoding='UTF-32-LE')
+                res = self.loadfunc(name, encoding='UTF-32-LE')
                 assert_array_equal(res, wanted)
-                res = getattr(np, self.loadfunc)(
-                    fopen(name, "rt", encoding='UTF-32-LE'))
+                with fopen(name, "rt", encoding='UTF-32-LE') as f:
+                    res = self.loadfunc(f)
                 assert_array_equal(res, wanted)

     # Python2 .open does not support encoding
-    @np.testing.dec.skipif(MAJVER == 2)
+    @dec.skipif(MAJVER == 2)
     def test_compressed_gzip(self):
         self.check_compressed(gzip.open, ('.gz',))

-    @np.testing.dec.skipif(MAJVER == 2 or not HAS_BZ2)
+    @dec.skipif(MAJVER == 2 or not HAS_BZ2)
     def test_compressed_bz2(self):
         self.check_compressed(bz2.open, ('.bz2',))

-    @np.testing.dec.skipif(MAJVER == 2 or not HAS_LZMA)
+    @dec.skipif(MAJVER == 2 or not HAS_LZMA)
     def test_compressed_lzma(self):
         self.check_compressed(lzma.open, ('.xz', '.lzma'))

     def test_encoding(self):
         with temppath() as path:
             with open(path, "wb") as f:
                 f.write('0.\n1.\n2.'.encode("UTF-16"))
-            x = getattr(np, self.loadfunc)(path, encoding="UTF-16")
+            x = self.loadfunc(path, encoding="UTF-16")
             assert_array_equal(x, [0., 1., 2.])

     def test_stringload(self):
         # umlaute
         nonascii = b'\xc3\xb6\xc3\xbc\xc3\xb6'.decode("UTF-8")
         with temppath() as path:
             with open(path, "wb") as f:
                 f.write(nonascii.encode("UTF-16"))
-            x = getattr(np, self.loadfunc)(path, encoding="UTF-16", dtype=np.unicode)
+            x = self.loadfunc(path, encoding="UTF-16", dtype=np.unicode)
             assert_array_equal(x, nonascii)

     def test_binary_decode(self):
         utf16 = b'\xff\xfeh\x04 \x00i\x04 \x00j\x04'
-        v = getattr(np, self.loadfunc)(BytesIO(utf16), dtype=np.unicode,
-                                       encoding='UTF-16')
+        v = self.loadfunc(BytesIO(utf16), dtype=np.unicode, encoding='UTF-16')
         assert_array_equal(v, np.array(utf16.decode('UTF-16').split()))

     def test_converters_decode(self):
         # test converters that decode strings
         c = TextIO()
         c.write(b'\xcf\x96')
         c.seek(0)
-        x = getattr(np, self.loadfunc)(c, dtype=np.unicode,
-                                       converters={0: lambda x: x.decode('UTF-8')})
+        x = self.loadfunc(c, dtype=np.unicode,
+                          converters={0: lambda x: x.decode('UTF-8')})
         a = np.array([b'\xcf\x96'.decode('UTF-8')])
         assert_array_equal(x, a)
@@ -599,15 +589,16 @@
         with temppath() as path:
             with io.open(path, 'wt', encoding='UTF-8') as f:
                 f.write(utf8)
-            x = getattr(np, self.loadfunc)(path, dtype=np.unicode,
-                                           converters={0: lambda x: x + 't'},
-                                           encoding='UTF-8')
+            x = self.loadfunc(path, dtype=np.unicode,
+                              converters={0: lambda x: x + 't'},
+                              encoding='UTF-8')
             a = np.array([utf8 + 't'])
             assert_array_equal(x, a)

 class TestLoadTxt(LoadTxtBase):
-    loadfunc = 'loadtxt'
+    loadfunc = staticmethod(np.loadtxt)
+
     def setUp(self):
         # lower chunksize for testing
         self.orig_chunk = np.lib.npyio._loadtxt_chunksize
@@ -1016,7 +1007,7 @@ class TestLoadTxt(LoadTxtBase):
         dt = np.dtype([('x', int), ('a', 'S10'), ('y', int)])
         np.loadtxt(c, delimiter=',', dtype=dt, comments=None)  # Should succeed

-    @np.testing.dec.skipif(locale.getpreferredencoding() == 'ANSI_X3.4-1968')
+    @dec.skipif(locale.getpreferredencoding() == 'ANSI_X3.4-1968')
     def test_binary_load(self):
         butf8 = b"5,6,7,\xc3\x95scarscar\n\r15,2,3,hello\n\r"\
                 b"20,2,3,\xc3\x95scar\n\r"
@@ -1087,7 +1078,8 @@ class Testfromregex(object):

 class TestFromTxt(LoadTxtBase):
-    loadfunc = 'genfromtxt'
+    loadfunc = staticmethod(np.genfromtxt)
+
     def test_record(self):
         # Test w/ explicit dtype
         data = TextIO('1 2\n3 4')
@@ -1933,8 +1925,7 @@ M 33 21.99

     def test_binary_decode_autodtype(self):
         utf16 = b'\xff\xfeh\x04 \x00i\x04 \x00j\x04'
-        v = getattr(np, self.loadfunc)(BytesIO(utf16), dtype=None,
-                                       encoding='UTF-16')
+        v = self.loadfunc(BytesIO(utf16), dtype=None, encoding='UTF-16')
         assert_array_equal(v, np.array(utf16.decode('UTF-16').split()))

     def test_utf8_byte_encoding(self):
@@ -1975,28 +1966,40 @@ M 33 21.99
             assert_equal(test['f0'], 0)
             assert_equal(test['f1'], "testNonethe" + utf8.decode("UTF-8"))

-    @np.testing.dec.skipif(can_encode(b"\xcf\x96".decode('UTF-8')))
+
     def test_utf8_file_nodtype_unicode(self):
         # bytes encoding with non-latin1 -> unicode upcast
-        utf8 = b"\xcf\x96"
-        latin1 = b"\xf6\xfc\xf6"
+        utf8 = u'\u03d6'
+        latin1 = u'\xf6\xfc\xf6'
+
+        # skip test if cannot encode utf8 test string with preferred
+        # encoding. The preferred encoding is assumed to be the default
+        # encoding of io.open. Will need to change this for PyTest, maybe
+        # using pytest.mark.xfail(raises=***).
+        try:
+            import locale
+            encoding = locale.getpreferredencoding()
+            utf8.encode(encoding)
+        except (UnicodeError, ImportError):
+            raise SkipTest('Skipping test_utf8_file_nodtype_unicode, '
+                           'unable to encode utf8 in preferred encoding')
+
         with temppath() as path:
-            with io.open(path, "wt",
-                         encoding=locale.getpreferredencoding()) as f:
+            with io.open(path, "wt") as f:
                 f.write(u"norm1,norm2,norm3\n")
-                f.write(u"norm1," + latin1.decode("latin1") + u",norm3\n")
-                f.write(u"test1,testNonethe" + utf8.decode("UTF-8") +
-                        u",test3\n")
+                f.write(u"norm1," + latin1 + u",norm3\n")
+                f.write(u"test1,testNonethe" + utf8 + u",test3\n")
+
             with warnings.catch_warnings(record=True) as w:
                 warnings.filterwarnings('always', '',
                                         np.VisibleDeprecationWarning)
                 test = np.genfromtxt(path, dtype=None, comments=None,
                                      delimiter=',')
+                # Check for warning when encoding not specified.
                 assert_(w[0].category is np.VisibleDeprecationWarning)
             ctl = np.array([
                      ["norm1", "norm2", "norm3"],
-                     ["norm1", latin1.decode("latin1"), "norm3"],
-                     ["test1", "testNonethe" + utf8.decode("UTF-8"), "test3"]],
+                     ["norm1", latin1, "norm3"],
+                     ["test1", "testNonethe" + utf8, "test3"]],
                      dtype=np.unicode)
             assert_array_equal(test, ctl)
@@ -2174,7 +2177,7 @@ M 33 21.99

 class TestPathUsage(object):
     # Test that pathlib.Path can be used
-    @np.testing.dec.skipif(Path is None, "No pathlib.Path")
+    @dec.skipif(Path is None, "No pathlib.Path")
     def test_loadtxt(self):
         with temppath(suffix='.txt') as path:
             path = Path(path)
             a = np.array([[1.1, 2], [3, 4]])
             np.savetxt(path, a)
             x = np.loadtxt(path)
             assert_array_equal(x, a)

-    @np.testing.dec.skipif(Path is None, "No pathlib.Path")
+    @dec.skipif(Path is None, "No pathlib.Path")
     def test_save_load(self):
         # Test that pathlib.Path instances can be used with save.
         with temppath(suffix='.npy') as path:
             path = Path(path)
             a = np.array([[1, 2], [3, 4]], int)
             np.save(path, a)
             data = np.load(path)
             assert_array_equal(data, a)

-    @np.testing.dec.skipif(Path is None, "No pathlib.Path")
+    @dec.skipif(Path is None, "No pathlib.Path")
     def test_savez_load(self):
         # Test that pathlib.Path instances can be used with savez.
         with temppath(suffix='.npz') as path:
             path = Path(path)
             np.savez(path, lab='place holder')
             with np.load(path) as data:
                 assert_array_equal(data['lab'], 'place holder')

-    @np.testing.dec.skipif(Path is None, "No pathlib.Path")
+    @dec.skipif(Path is None, "No pathlib.Path")
     def test_savez_compressed_load(self):
         # Test that pathlib.Path instances can be used with savez_compressed.
         with temppath(suffix='.npz') as path:
@@ -2212,7 +2215,7 @@ class TestPathUsage(object):
             np.savez_compressed(path, lab='place holder')
             data = np.load(path)
             assert_array_equal(data['lab'], 'place holder')
             data.close()

-    @np.testing.dec.skipif(Path is None, "No pathlib.Path")
+    @dec.skipif(Path is None, "No pathlib.Path")
     def test_genfromtxt(self):
         with temppath(suffix='.txt') as path:
             path = Path(path)
             a = np.array([(1, 2), (3, 4)])
             np.savetxt(path, a)
             data = np.genfromtxt(path)
             assert_array_equal(a, data)

-    @np.testing.dec.skipif(Path is None, "No pathlib.Path")
+    @dec.skipif(Path is None, "No pathlib.Path")
     def test_ndfromtxt(self):
         # Test outputting a standard ndarray
         with temppath(suffix='.txt') as path:
             path = Path(path)
             with path.open('w') as f:
                 f.write(u'1 2\n3 4')

             control = np.array([[1, 2], [3, 4]], dtype=int)
             test = np.ndfromtxt(path, dtype=int)
             assert_array_equal(test, control)

-    @np.testing.dec.skipif(Path is None, "No pathlib.Path")
+    @dec.skipif(Path is None, "No pathlib.Path")
     def test_mafromtxt(self):
         # From `test_fancy_dtype_alt` above
         with temppath(suffix='.txt') as path:
             path = Path(path)
             with path.open('w') as f:
                 f.write(u'1,2,3.0\n4,5,6.0\n')

             test = np.mafromtxt(path, delimiter=',')
             control = ma.array([(1.0, 2.0, 3.0), (4.0, 5.0, 6.0)])
             assert_equal(test, control)

-    @np.testing.dec.skipif(Path is None, "No pathlib.Path")
+    @dec.skipif(Path is None, "No pathlib.Path")
     def test_recfromtxt(self):
         with temppath(suffix='.txt') as path:
             path = Path(path)
             with path.open('w') as f:
                 f.write(u'A,B\n0,1\n2,3')

             kwargs = dict(delimiter=",", missing_values="N/A", names=True)
             test = np.recfromtxt(path, **kwargs)
             control = np.array([(0, 1), (2, 3)],
                                dtype=[('A', np.int), ('B', np.int)])
             assert_(isinstance(test, np.recarray))
             assert_equal(test, control)

-    @np.testing.dec.skipif(Path is None, "No pathlib.Path")
+    @dec.skipif(Path is None, "No pathlib.Path")
     def test_recfromcsv(self):
         with temppath(suffix='.txt') as path:
             path = Path(path)
--
cgit v1.2.1
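
With the lzma opener registered in _datasource, the text loaders accept .xz
and .lzma paths directly. A minimal sketch, assuming Python 3 with the lzma
module available and a hypothetical file name::

    import lzma

    import numpy as np

    # Write a small xz-compressed text file ...
    with lzma.open('data.txt.xz', 'wt', encoding='UTF-8') as f:
        f.write(u'0 1 2\n3 4 5\n')

    # ... and let the .xz suffix select the opener via np.lib._datasource.
    arr = np.genfromtxt('data.txt.xz', encoding='UTF-8')
    print(arr.shape)  # (2, 3)
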