From d8edc62e8c9e69280fb8a171c7678b2fea929696 Mon Sep 17 00:00:00 2001 From: Julian Taylor Date: Mon, 3 Apr 2017 14:20:36 +0200 Subject: ENH: Add encoding option to numpy text IO. This modifies loadtxt and genfromtxt in several ways intended to add unicode support for text files by adding an `encoding` keyword to np.loadtxt, np.genfromtxt, np.savetxt, and np.fromregex. The original treatment of the relevant files was to open them as byte files, whereas they are now opened as text files with an encoding. When read, they are decoded to unicode strings for Python3 compatibility, and when written, they are encoded as specified. For backward compatibility, the default encoding in both cases is latin1. --- numpy/lib/_datasource.py | 88 +++++++-- numpy/lib/_iotools.py | 62 +++---- numpy/lib/npyio.py | 346 ++++++++++++++++++++++++---------- numpy/lib/tests/test__iotools.py | 123 ++++++------ numpy/lib/tests/test_io.py | 392 +++++++++++++++++++++++++++++++++++---- 5 files changed, 771 insertions(+), 240 deletions(-) (limited to 'numpy/lib') diff --git a/numpy/lib/_datasource.py b/numpy/lib/_datasource.py index 3affc5195..ad939df3f 100644 --- a/numpy/lib/_datasource.py +++ b/numpy/lib/_datasource.py @@ -15,7 +15,7 @@ DataSource files can originate locally or remotely: - URLs (http, ftp, ...) : 'http://www.scipy.org/not/real/data.txt' DataSource files can also be compressed or uncompressed. Currently only -gzip and bz2 are supported. +gzip, bz2 and xz are supported. Example:: @@ -38,13 +38,59 @@ from __future__ import division, absolute_import, print_function import os import sys import shutil +import io _open = open +def _check_mode(mode, encoding, newline): + if "t" in mode: + if "b" in mode: + raise ValueError("Invalid mode: %r" % (mode,)) + else: + if encoding is not None: + raise ValueError("Argument 'encoding' not supported in binary mode") + if newline is not None: + raise ValueError("Argument 'newline' not supported in binary mode") + +def _python2_bz2open(fn, mode, encoding, newline): + """ wrapper to open bz2 in text mode """ + import bz2 + + _check_mode(mode, encoding, newline) + + if "t" in mode: + # BZ2File is missing necessary functions for TextIOWrapper + raise ValueError("bz2 text files not supported in python2") + else: + return bz2.BZ2File(fn, mode) + +def _python2_gzipopen(fn, mode, encoding, newline): + """ wrapper to open gzip in text mode """ + import gzip + # gzip is lacking read1 needed for TextIOWrapper + class GzipWrap(gzip.GzipFile): + def read1(self, n): + return self.read(n) + + _check_mode(mode, encoding, newline) + + gz_mode = mode.replace("t", "") + if isinstance(fn, (str, bytes)): + binary_file = GzipWrap(fn, gz_mode) + elif hasattr(fn, "read") or hasattr(fn, "write"): + binary_file = GzipWrap(None, gz_mode, fileobj=fn) + else: + raise TypeError("filename must be a str or bytes object, or a file") + + if "t" in mode: + return io.TextIOWrapper(binary_file, encoding, newline=newline) + else: + return binary_file + # Using a class instead of a module-level dictionary # to reduce the initial 'import numpy' overhead by -# deferring the import of bz2 and gzip until needed +# deferring the import of lzma, bz2 and gzip until needed # TODO: .zip support, .tar support? class _FileOpeners(object): @@ -55,7 +101,7 @@ class _FileOpeners(object): supported file format. Attribute lookup is implemented in such a way that an instance of `_FileOpeners` itself can be indexed with the keys of that dictionary. 
Currently uncompressed files as well as files - compressed with ``gzip`` or ``bz2`` compression are supported. + compressed with ``gzip``, ``bz2`` or ``xz`` compression are supported. Notes ----- @@ -65,7 +111,7 @@ class _FileOpeners(object): Examples -------- >>> np.lib._datasource._file_openers.keys() - [None, '.bz2', '.gz'] + [None, '.bz2', '.gz', '.xz', '.lzma'] >>> np.lib._datasource._file_openers['.gz'] is gzip.open True @@ -73,19 +119,31 @@ class _FileOpeners(object): def __init__(self): self._loaded = False - self._file_openers = {None: open} + self._file_openers = {None: io.open} def _load(self): if self._loaded: return try: import bz2 - self._file_openers[".bz2"] = bz2.BZ2File + if sys.version_info[0] >= 3: + self._file_openers[".bz2"] = bz2.open + else: + self._file_openers[".bz2"] = _python2_bz2open except ImportError: pass try: import gzip - self._file_openers[".gz"] = gzip.open + if sys.version_info[0] >= 3: + self._file_openers[".gz"] = gzip.open + else: + self._file_openers[".gz"] = _python2_gzipopen + except ImportError: + pass + try: + import lzma + self._file_openers[".xz"] = lzma.open + self._file_openers[".lzma"] = lzma.open except ImportError: pass self._loaded = True @@ -102,7 +160,7 @@ class _FileOpeners(object): ------- keys : list The keys are None for uncompressed files and the file extension - strings (i.e. ``'.gz'``, ``'.bz2'``) for supported compression + strings (i.e. ``'.gz'``, ``'.xz'``) for supported compression methods. """ @@ -115,7 +173,7 @@ class _FileOpeners(object): _file_openers = _FileOpeners() -def open(path, mode='r', destpath=os.curdir): +def open(path, mode='r', destpath=os.curdir, encoding=None, newline=None): """ Open `path` with `mode` and return the file object. @@ -148,7 +206,7 @@ def open(path, mode='r', destpath=os.curdir): """ ds = DataSource(destpath) - return ds.open(path, mode) + return ds.open(path, mode, encoding=encoding, newline=newline) class DataSource (object): @@ -458,7 +516,7 @@ class DataSource (object): return False return False - def open(self, path, mode='r'): + def open(self, path, mode='r', encoding=None, newline=None): """ Open and return file-like object. @@ -496,7 +554,8 @@ class DataSource (object): _fname, ext = self._splitzipext(found) if ext == 'bz2': mode.replace("+", "") - return _file_openers[ext](found, mode=mode) + return _file_openers[ext](found, mode=mode, + encoding=encoding, newline=newline) else: raise IOError("%s not found." % path) @@ -619,7 +678,7 @@ class Repository (DataSource): """ return DataSource.exists(self, self._fullpath(path)) - def open(self, path, mode='r'): + def open(self, path, mode='r', encoding=None, newline=None): """ Open and return file-like object prepending Repository base URL. @@ -643,7 +702,8 @@ class Repository (DataSource): File object. 
""" - return DataSource.open(self, self._fullpath(path), mode) + return DataSource.open(self, self._fullpath(path), mode, + encoding=encoding, newline=newline) def listdir(self): """ diff --git a/numpy/lib/_iotools.py b/numpy/lib/_iotools.py index 1874c2e97..8e091d42d 100644 --- a/numpy/lib/_iotools.py +++ b/numpy/lib/_iotools.py @@ -8,7 +8,7 @@ __docformat__ = "restructuredtext en" import sys import numpy as np import numpy.core.numeric as nx -from numpy.compat import asbytes, bytes, asbytes_nested, basestring +from numpy.compat import asbytes, asunicode, bytes, asbytes_nested, basestring if sys.version_info[0] >= 3: from builtins import bool, int, float, complex, object, str @@ -17,15 +17,15 @@ else: from __builtin__ import bool, int, float, complex, object, unicode, str -if sys.version_info[0] >= 3: - def _bytes_to_complex(s): - return complex(s.decode('ascii')) +def _decode_line(line, encoding=None): + """ decode bytes from binary input streams, default to latin1 """ + if type(line) is bytes: + if encoding is None: + line = line.decode('latin1') + else: + line = line.decode(encoding) - def _bytes_to_name(s): - return s.decode('ascii') -else: - _bytes_to_complex = complex - _bytes_to_name = str + return line def _is_string_like(obj): @@ -189,12 +189,10 @@ class LineSplitter(object): return lambda input: [_.strip() for _ in method(input)] # - def __init__(self, delimiter=None, comments=b'#', autostrip=True): + def __init__(self, delimiter=None, comments='#', autostrip=True, encoding=None): self.comments = comments # Delimiter is a character - if isinstance(delimiter, unicode): - delimiter = delimiter.encode('ascii') - if (delimiter is None) or _is_bytes_like(delimiter): + if (delimiter is None) or isinstance(delimiter, basestring): delimiter = delimiter or None _handyman = self._delimited_splitter # Delimiter is a list of field widths @@ -213,12 +211,14 @@ class LineSplitter(object): self._handyman = self.autostrip(_handyman) else: self._handyman = _handyman + self.encoding = encoding # def _delimited_splitter(self, line): + """Chop off comments, strip, and split at delimiter. 
""" if self.comments is not None: line = line.split(self.comments)[0] - line = line.strip(b" \r\n") + line = line.strip(" \r\n") if not line: return [] return line.split(self.delimiter) @@ -227,7 +227,7 @@ class LineSplitter(object): def _fixedwidth_splitter(self, line): if self.comments is not None: line = line.split(self.comments)[0] - line = line.strip(b"\r\n") + line = line.strip("\r\n") if not line: return [] fixed = self.delimiter @@ -245,7 +245,7 @@ class LineSplitter(object): # def __call__(self, line): - return self._handyman(line) + return self._handyman(_decode_line(line, self.encoding)) class NameValidator(object): @@ -434,9 +434,9 @@ def str2bool(value): """ value = value.upper() - if value == b'TRUE': + if value == 'TRUE': return True - elif value == b'FALSE': + elif value == 'FALSE': return False else: raise ValueError("Invalid boolean") @@ -527,9 +527,10 @@ class StringConverter(object): _mapper.append((nx.int64, int, -1)) _mapper.extend([(nx.floating, float, nx.nan), - (nx.complexfloating, _bytes_to_complex, nx.nan + 0j), + (nx.complexfloating, complex, nx.nan + 0j), (nx.longdouble, nx.longdouble, nx.nan), - (nx.string_, bytes, b'???')]) + (nx.unicode_, asunicode, '???'), + (nx.string_, asbytes, '???')]) (_defaulttype, _defaultfunc, _defaultfill) = zip(*_mapper) @@ -601,11 +602,6 @@ class StringConverter(object): def __init__(self, dtype_or_func=None, default=None, missing_values=None, locked=False): - # Convert unicode (for Py3) - if isinstance(missing_values, unicode): - missing_values = asbytes(missing_values) - elif isinstance(missing_values, (list, tuple)): - missing_values = asbytes_nested(missing_values) # Defines a lock for upgrade self._locked = bool(locked) # No input dtype: minimal initialization @@ -631,7 +627,7 @@ class StringConverter(object): # None if default is None: try: - default = self.func(b'0') + default = self.func('0') except ValueError: default = None dtype = self._getdtype(default) @@ -676,11 +672,11 @@ class StringConverter(object): self.func = lambda x: int(float(x)) # Store the list of strings corresponding to missing values. if missing_values is None: - self.missing_values = set([b'']) + self.missing_values = set(['']) else: - if isinstance(missing_values, bytes): - missing_values = missing_values.split(b",") - self.missing_values = set(list(missing_values) + [b'']) + if isinstance(missing_values, basestring): + missing_values = missing_values.split(",") + self.missing_values = set(list(missing_values) + ['']) # self._callingfunction = self._strict_call self.type = self._dtypeortype(dtype) @@ -801,7 +797,7 @@ class StringConverter(object): self.iterupgrade(value) def update(self, func, default=None, testing_value=None, - missing_values=b'', locked=False): + missing_values='', locked=False): """ Set StringConverter attributes directly. 
@@ -838,13 +834,13 @@ class StringConverter(object): self.type = self._dtypeortype(self._getdtype(default)) else: try: - tester = func(testing_value or b'1') + tester = func(testing_value or '1') except (TypeError, ValueError): tester = None self.type = self._dtypeortype(self._getdtype(tester)) # Add the missing values to the existing set if missing_values is not None: - if _is_bytes_like(missing_values): + if isinstance(missing_values, basestring): self.missing_values.add(missing_values) elif hasattr(missing_values, '__iter__'): for val in missing_values: diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py index 6de5940d7..fe2aa436b 100644 --- a/numpy/lib/npyio.py +++ b/numpy/lib/npyio.py @@ -1,5 +1,6 @@ from __future__ import division, absolute_import, print_function +import io import sys import os import re @@ -15,11 +16,12 @@ from numpy.core.multiarray import packbits, unpackbits from ._iotools import ( LineSplitter, NameValidator, StringConverter, ConverterError, ConverterLockError, ConversionWarning, _is_string_like, - has_nested_fields, flatten_dtype, easy_dtype, _bytes_to_name + has_nested_fields, flatten_dtype, easy_dtype, _decode_line ) from numpy.compat import ( - asbytes, asstr, asbytes_nested, bytes, basestring, unicode, is_pathlib_path + asbytes, asstr, asunicode, asbytes_nested, bytes, basestring, unicode, + is_pathlib_path ) if sys.version_info[0] >= 3: @@ -731,7 +733,7 @@ def _getconv(dtype): def floatconv(x): x.lower() - if b'0x' in x: + if '0x' in x: return float.fromhex(asstr(x)) return float(x) @@ -752,13 +754,17 @@ def _getconv(dtype): return lambda x: complex(asstr(x)) elif issubclass(typ, np.bytes_): return asbytes + elif issubclass(typ, np.unicode_): + return asunicode else: return asstr +# number of lines loadtxt reads in one chunk, can be overridden for testing +_loadtxt_chunksize = 50000 def loadtxt(fname, dtype=float, comments='#', delimiter=None, converters=None, skiprows=0, usecols=None, unpack=False, - ndmin=0): + ndmin=0, encoding='bytes'): """ Load data from a text file. @@ -813,6 +819,15 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, Legal values: 0 (default), 1 or 2. .. versionadded:: 1.6.0 + encoding: string, optional + Encoding used to decode the inputfile. Does not apply to input streams. + The special value 'bytes' enables backward compatibility workarounds + that ensures you receive byte arrays as results if possible and passes + latin1 encoded strings to converters. Override this value to receive + unicode arrays and pass strings as input to converters. + If set to None the system default is used. + + .. 
versionadded:: 1.14.0 Returns ------- @@ -861,16 +876,20 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, # Type conversions for Py3 convenience if comments is not None: if isinstance(comments, (basestring, bytes)): - comments = [asbytes(comments)] - else: - comments = [asbytes(comment) for comment in comments] + comments = [comments] + + comments = [_decode_line(x) for x in comments] # Compile regex for comments beforehand comments = (re.escape(comment) for comment in comments) - regex_comments = re.compile(b'|'.join(comments)) + regex_comments = re.compile('|'.join(comments)) user_converters = converters - if delimiter is not None: - delimiter = asbytes(delimiter) + + if encoding == 'bytes': + encoding = None + byte_converters = True + else: + byte_converters = False if usecols is not None: # Allow usecols to be a single int or a sequence of ints @@ -896,22 +915,24 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, if is_pathlib_path(fname): fname = str(fname) if _is_string_like(fname): + fh = np.lib._datasource.open(fname, 'rt', encoding=encoding) + fencoding = getattr(fh, 'encoding', 'latin1') + fh = iter(fh) fown = True - if fname.endswith('.gz'): - import gzip - fh = iter(gzip.GzipFile(fname)) - elif fname.endswith('.bz2'): - import bz2 - fh = iter(bz2.BZ2File(fname)) - elif sys.version_info[0] == 2: - fh = iter(open(fname, 'U')) - else: - fh = iter(open(fname)) else: fh = iter(fname) + fencoding = getattr(fname, 'encoding', 'latin1') except TypeError: raise ValueError('fname must be a string, file handle, or generator') - X = [] + + # input may be a python2 io stream + if encoding is not None: + fencoding = encoding + # we must assume local encoding + # TODO: emit portability warning? + elif fencoding is None: + import locale + fencoding = locale.getpreferredencoding() # not to be confused with the flatten_dtype we import... def flatten_dtype_internal(dt): @@ -960,21 +981,43 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, return tuple(ret) def split_line(line): - """Chop off comments, strip, and split at delimiter. - - Note that although the file is opened as text, this function - returns bytes. 
""" + line = _decode_line(line, encoding=encoding) - """ - line = asbytes(line) if comments is not None: - line = regex_comments.split(asbytes(line), maxsplit=1)[0] - line = line.strip(b'\r\n') + line = regex_comments.split(line, maxsplit=1)[0] + line = line.strip('\r\n') if line: return line.split(delimiter) else: return [] + def read_data(chunk_size): + # Parse each line, including the first + X = [] + for i, line in enumerate(itertools.chain([first_line], fh)): + vals = split_line(line) + if len(vals) == 0: + continue + if usecols: + vals = [vals[j] for j in usecols] + if len(vals) != N: + line_num = i + skiprows + 1 + raise ValueError("Wrong number of columns at line %d" + % line_num) + + # Convert each value according to its column and store + items = [conv(val) for (conv, val) in zip(converters, vals)] + + # Then pack it according to the dtype's nesting + items = pack_items(items, packing) + X.append(items) + if len(X) > chunk_size: + yield X + X = [] + if X: + yield X + try: # Make sure we're dealing with a proper dtype dtype = np.dtype(dtype) @@ -1017,30 +1060,42 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, except ValueError: # Unused converter specified continue - converters[i] = conv - - # Parse each line, including the first - for i, line in enumerate(itertools.chain([first_line], fh)): - vals = split_line(line) - if len(vals) == 0: - continue - if usecols: - vals = [vals[j] for j in usecols] - if len(vals) != N: - line_num = i + skiprows + 1 - raise ValueError("Wrong number of columns at line %d" - % line_num) - - # Convert each value according to its column and store - items = [conv(val) for (conv, val) in zip(converters, vals)] - # Then pack it according to the dtype's nesting - items = pack_items(items, packing) - X.append(items) + if byte_converters: + # converters may use decode to workaround numpy's oldd behaviour, + # so encode the string again before passing to the user converter + def tobytes_first(x, conv): + if type(x) is bytes: + return conv(x) + return conv(x.encode("latin1")) + import functools + converters[i] = functools.partial(tobytes_first, conv=conv) + else: + converters[i] = conv + + converters = [conv if conv is not bytes else + lambda x: x.encode(fencoding) for conv in converters] + + # read data in chunks and fill it into an array via resize + # over-allocating and shrinking the array later may be faster but is + # probably not relevant compared to the cost of actually reading and + # converting the data + X = None + for x in read_data(_loadtxt_chunksize): + if X is None: + X = np.array(x, dtype) + else: + nshape = list(X.shape) + pos = nshape[0] + nshape[0] += len(x) + X.resize(nshape) + X[pos:, ...] = x finally: if fown: fh.close() - X = np.array(X, dtype) + if X is None: + X = np.array([], dtype) + # Multicolumn data are returned with shape (1, N, M), i.e. # (1, 1, M) for a single row - remove the singleton dimension there if X.ndim == 3 and X.shape[:2] == (1, 1): @@ -1072,7 +1127,7 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='', - footer='', comments='# '): + footer='', comments='# ', encoding=None): """ Save an array to a text file. @@ -1116,6 +1171,11 @@ def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='', ``numpy.loadtxt``. .. versionadded:: 1.7.0 + encoding: string, optional + Encoding used to encode the outputfile. Does not apply to output + streams. + + .. 
versionadded:: 1.14.0 See Also @@ -1190,21 +1250,51 @@ def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='', fmt = asstr(fmt) delimiter = asstr(delimiter) + class WriteWrap(object): + """ convert to unicode in py2 or to bytes on bytestream inputs """ + def __init__(self, fh, encoding): + self.fh = fh + self.encoding = encoding + self.do_write = self.first_write + + def close(self): + self.fh.close() + + def write(self, v): + self.do_write(v) + + def write_bytes(self, v): + if isinstance(v, bytes): + self.fh.write(v) + else: + self.fh.write(v.encode(self.encoding)) + + def write_normal(self, v): + self.fh.write(asunicode(v)) + + def first_write(self, v): + try: + self.write_normal(v) + self.write = self.write_normal + except TypeError: + # input is probably a bytestream + self.write_bytes(v) + self.write = self.write_bytes + own_fh = False if is_pathlib_path(fname): fname = str(fname) if _is_string_like(fname): + # datasource doesn't support creating a new file ... + open(fname, 'wt').close() + fh = np.lib._datasource.open(fname, 'wt', encoding=encoding) own_fh = True - if fname.endswith('.gz'): - import gzip - fh = gzip.open(fname, 'wb') - else: - if sys.version_info[0] >= 3: - fh = open(fname, 'wb') - else: - fh = open(fname, 'w') + # need to convert str to unicode for text io output + if sys.version_info[0] == 2: + fh = WriteWrap(fh, encoding or 'latin1') elif hasattr(fname, 'write'): - fh = fname + # wrap to handle byte output streams + fh = WriteWrap(fname, encoding or 'latin1') else: raise ValueError('fname must be a string or file handle') @@ -1254,31 +1344,33 @@ def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='', if len(header) > 0: header = header.replace('\n', '\n' + comments) - fh.write(asbytes(comments + header + newline)) + fh.write(comments + header + newline) if iscomplex_X: for row in X: row2 = [] for number in row: row2.append(number.real) row2.append(number.imag) - fh.write(asbytes(format % tuple(row2) + newline)) + fh.write(format % tuple(row2) + newline) else: for row in X: try: - fh.write(asbytes(format % tuple(row) + newline)) + v = format % tuple(row) + newline except TypeError: raise TypeError("Mismatch between array dtype ('%s') and " "format specifier ('%s')" % (str(X.dtype), format)) + fh.write(v) + if len(footer) > 0: footer = footer.replace('\n', '\n' + comments) - fh.write(asbytes(comments + footer + newline)) + fh.write(comments + footer + newline) finally: if own_fh: fh.close() -def fromregex(file, regexp, dtype): +def fromregex(file, regexp, dtype, encoding=None): """ Construct an array from a text file, using regular expression parsing. @@ -1295,6 +1387,10 @@ def fromregex(file, regexp, dtype): Groups in the regular expression correspond to fields in the dtype. dtype : dtype or list of dtypes Dtype for the structured array. + encoding: string, optional + Encoding used to decode the inputfile. Does not apply to input streams. + + .. 
versionadded:: 1.14.0 Returns ------- @@ -1335,16 +1431,22 @@ def fromregex(file, regexp, dtype): """ own_fh = False if not hasattr(file, "read"): - file = open(file, 'rb') + file = np.lib._datasource.open(file, 'rt', encoding=encoding) own_fh = True try: - if not hasattr(regexp, 'match'): - regexp = re.compile(asbytes(regexp)) if not isinstance(dtype, np.dtype): dtype = np.dtype(dtype) - seq = regexp.findall(file.read()) + content = file.read() + if isinstance(content, bytes) and not isinstance(regexp, bytes): + regexp = asbytes(regexp) + elif not isinstance(content, bytes) and isinstance(regexp, bytes): + regexp = asstr(regexp) + + if not hasattr(regexp, 'match'): + regexp = re.compile(regexp) + seq = regexp.findall(content) if seq and not isinstance(seq[0], tuple): # Only one group is in the regexp. # Create the new array as a single data-type and then @@ -1372,7 +1474,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, names=None, excludelist=None, deletechars=None, replace_space='_', autostrip=False, case_sensitive=True, defaultfmt="f%i", unpack=None, usemask=False, loose=True, - invalid_raise=True, max_rows=None): + invalid_raise=True, max_rows=None, encoding='bytes'): """ Load data from a text file, with missing values handled as specified. @@ -1460,6 +1562,15 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, to read the entire file. .. versionadded:: 1.10.0 + encoding: string, optional + Encoding used to decode the inputfile. Does not apply to input streams. + The special value 'bytes' enables backward compatibility workarounds + that ensures you receive byte arrays as results if possible and passes + latin1 encoded strings to converters. Override this value to receive + unicode arrays and pass strings as input to converters. + If set to None the system default is used. + + .. versionadded:: 1.14.0 Returns ------- @@ -1536,15 +1647,6 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, if max_rows < 1: raise ValueError("'max_rows' must be at least 1.") - # Py3 data conversions to bytes, for convenience - if comments is not None: - comments = asbytes(comments) - if isinstance(delimiter, unicode): - delimiter = asbytes(delimiter) - if isinstance(missing_values, (unicode, list, tuple)): - missing_values = asbytes_nested(missing_values) - - # if usemask: from numpy.ma import MaskedArray, make_mask_descr # Check the input dictionary of converters @@ -1554,16 +1656,19 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, "The input argument 'converter' should be a valid dictionary " "(got '%s' instead)" % type(user_converters)) + if encoding == 'bytes': + encoding = None + byte_converters = True + else: + byte_converters = False + # Initialize the filehandle, the LineSplitter and the NameValidator own_fhd = False try: if is_pathlib_path(fname): fname = str(fname) if isinstance(fname, basestring): - if sys.version_info[0] == 2: - fhd = iter(np.lib._datasource.open(fname, 'rbU')) - else: - fhd = iter(np.lib._datasource.open(fname, 'rb')) + fhd = iter(np.lib._datasource.open(fname, 'rt', encoding=encoding)) own_fhd = True else: fhd = iter(fname) @@ -1573,7 +1678,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, "or generator. Got %s instead." 
% type(fname)) split_line = LineSplitter(delimiter=delimiter, comments=comments, - autostrip=autostrip)._handyman + autostrip=autostrip, encoding=encoding) validate_names = NameValidator(excludelist=excludelist, deletechars=deletechars, case_sensitive=case_sensitive, @@ -1587,15 +1692,15 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, first_values = None try: while not first_values: - first_line = next(fhd) + first_line = _decode_line(next(fhd), encoding) if names is True: if comments in first_line: first_line = ( - b''.join(first_line.split(comments)[1:])) + ''.join(first_line.split(comments)[1:])) first_values = split_line(first_line) except StopIteration: # return an empty array if the datafile is empty - first_line = b'' + first_line = '' first_values = [] warnings.warn('genfromtxt: Empty input file: "%s"' % fname, stacklevel=2) @@ -1618,9 +1723,8 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, # Check the names and overwrite the dtype.names if needed if names is True: - names = validate_names([_bytes_to_name(_.strip()) - for _ in first_values]) + names = validate_names([str(_.strip()) for _ in first_values]) - first_line = b'' + first_line = '' elif _is_string_like(names): names = validate_names([_.strip() for _ in names.split(',')]) elif names: @@ -1657,9 +1761,11 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, # Process the missing values ............................... # Rename missing_values for convenience user_missing_values = missing_values or () + if isinstance(user_missing_values, bytes): + user_missing_values = user_missing_values.decode('latin1') # Define the list of missing_values (one column: one list) - missing_values = [list([b'']) for _ in range(nbcols)] + missing_values = [list(['']) for _ in range(nbcols)] # We have a dictionary: process it field by field if isinstance(user_missing_values, dict): @@ -1698,8 +1804,8 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, if value not in entry: entry.append(value) # We have a string : apply it to all entries - elif isinstance(user_missing_values, bytes): - user_value = user_missing_values.split(b",") + elif isinstance(user_missing_values, basestring): + user_value = user_missing_values.split(",") for entry in missing_values: entry.extend(user_value) # We have something else: apply it to all entries @@ -1787,11 +1893,24 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, testing_value = first_values[j] else: testing_value = None - converters[i].update(conv, locked=True, + if conv is bytes: + user_conv = asbytes + elif byte_converters: + # converters may use decode to work around numpy's old behaviour, + # so encode the string again before passing to the user converter + def tobytes_first(x, conv): + if type(x) is bytes: + return conv(x) + return conv(x.encode("latin1")) + import functools + user_conv = functools.partial(tobytes_first, conv=conv) + else: + user_conv = conv + converters[i].update(user_conv, locked=True, testing_value=testing_value, default=filling_values[i], missing_values=missing_values[i],) - uc_update.append((i, conv)) + uc_update.append((i, user_conv)) # Make sure we have the corrected keys in user_converters... user_converters.update(uc_update) @@ -1908,16 +2027,43 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, column_types = [conv.type for conv in converters] # Find the columns with strings... 
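To make the converter wrapping above concrete: under the compatibility default encoding='bytes', user converters keep receiving latin1-encoded bytes exactly as before this patch, while an explicit encoding hands them native strings. A sketch with made-up data:

    import io
    import numpy as np

    raw = io.BytesIO(b'1,ab\n2,cd')
    # default encoding='bytes': the converter sees bytes (via tobytes_first)
    np.genfromtxt(raw, delimiter=',', dtype=None,
                  converters={1: lambda s: s.upper()})

    raw.seek(0)
    # explicit encoding: the converter sees str
    np.genfromtxt(raw, delimiter=',', dtype=None, encoding='latin1',
                  converters={1: lambda s: s.upper()})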
strcolidx = [i for (i, v) in enumerate(column_types) - if v in (type('S'), np.string_)] + if v == np.unicode_] + + typestr = 'U' + if byte_converters and strcolidx: + # convert strings back to bytes for backward compatibility + warnings.warn( + "Reading strings without specifying the encoding argument is " + "deprecated. Set encoding, use None for the system default.", + np.VisibleDeprecationWarning, stacklevel=2) + try: + for j in range(len(data)): + row = list(data[j]) + for i in strcolidx: + row[i] = row[i].encode('latin1') + data[j] = tuple(row) + typestr = 'S' + except UnicodeEncodeError: + # we must use unicode, revert encoding + for k in range(0, j + 1): + row = list(data[k]) + for i in strcolidx: + if isinstance(row[i], bytes): + row[i] = row[i].decode('latin1') + data[k] = tuple(row) + # ... and take the largest number of chars. for i in strcolidx: - column_types[i] = "|S%i" % max(len(row[i]) for row in data) + column_types[i] = "|%s%i" % (typestr, max(len(row[i]) for row in data)) # if names is None: # If the dtype is uniform, don't define names, else use '' base = set([c.type for c in converters if c._checked]) if len(base) == 1: - (ddtype, mdtype) = (list(base)[0], bool) + if strcolidx: + (ddtype, mdtype) = (typestr, bool) + else: + (ddtype, mdtype) = (list(base)[0], bool) else: ddtype = [(defaultfmt % i, dt) for (i, dt) in enumerate(column_types)] @@ -1966,8 +2112,8 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, # Keep the dtype of the current converter if i in user_converters: ishomogeneous &= (ttype == dtype.type) - if ttype == np.string_: - ttype = "|S%i" % max(len(row[i]) for row in data) + if np.issubdtype(ttype, np.character): + ttype = (ttype, max(len(row[i]) for row in data)) descr.append(('', ttype)) else: descr.append(('', dtype)) @@ -1992,7 +2138,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, if usemask and names: for (name, conv) in zip(names or (), converters): missing_values = [conv(_) for _ in conv.missing_values - if _ != b''] + if _ != ''] for mval in missing_values: outputmask[name] |= (output[name] == mval) # Construct the final array diff --git a/numpy/lib/tests/test__iotools.py b/numpy/lib/tests/test__iotools.py index 03192896c..990ee126d 100644 --- a/numpy/lib/tests/test__iotools.py +++ b/numpy/lib/tests/test__iotools.py @@ -19,61 +19,61 @@ class TestLineSplitter(object): def test_no_delimiter(self): "Test LineSplitter w/o delimiter" - strg = b" 1 2 3 4 5 # test" + strg = " 1 2 3 4 5 # test" test = LineSplitter()(strg) - assert_equal(test, [b'1', b'2', b'3', b'4', b'5']) + assert_equal(test, ['1', '2', '3', '4', '5']) test = LineSplitter('')(strg) - assert_equal(test, [b'1', b'2', b'3', b'4', b'5']) + assert_equal(test, ['1', '2', '3', '4', '5']) def test_space_delimiter(self): "Test space delimiter" - strg = b" 1 2 3 4 5 # test" - test = LineSplitter(b' ')(strg) - assert_equal(test, [b'1', b'2', b'3', b'4', b'', b'5']) - test = LineSplitter(b' ')(strg) - assert_equal(test, [b'1 2 3 4', b'5']) + strg = " 1 2 3 4 5 # test" + test = LineSplitter(' ')(strg) + assert_equal(test, ['1', '2', '3', '4', '', '5']) + test = LineSplitter(' ')(strg) + assert_equal(test, ['1 2 3 4', '5']) def test_tab_delimiter(self): "Test tab delimiter" - strg = b" 1\t 2\t 3\t 4\t 5 6" - test = LineSplitter(b'\t')(strg) - assert_equal(test, [b'1', b'2', b'3', b'4', b'5 6']) - strg = b" 1 2\t 3 4\t 5 6" - test = LineSplitter(b'\t')(strg) - assert_equal(test, [b'1 2', b'3 4', b'5 6']) + strg = " 1\t 2\t 3\t 4\t 5 6" + test = 
LineSplitter('\t')(strg) + assert_equal(test, ['1', '2', '3', '4', '5 6']) + strg = " 1 2\t 3 4\t 5 6" + test = LineSplitter('\t')(strg) + assert_equal(test, ['1 2', '3 4', '5 6']) def test_other_delimiter(self): "Test LineSplitter on delimiter" - strg = b"1,2,3,4,,5" - test = LineSplitter(b',')(strg) - assert_equal(test, [b'1', b'2', b'3', b'4', b'', b'5']) + strg = "1,2,3,4,,5" + test = LineSplitter(',')(strg) + assert_equal(test, ['1', '2', '3', '4', '', '5']) # - strg = b" 1,2,3,4,,5 # test" - test = LineSplitter(b',')(strg) - assert_equal(test, [b'1', b'2', b'3', b'4', b'', b'5']) + strg = " 1,2,3,4,,5 # test" + test = LineSplitter(',')(strg) + assert_equal(test, ['1', '2', '3', '4', '', '5']) def test_constant_fixed_width(self): "Test LineSplitter w/ fixed-width fields" - strg = b" 1 2 3 4 5 # test" + strg = " 1 2 3 4 5 # test" test = LineSplitter(3)(strg) - assert_equal(test, [b'1', b'2', b'3', b'4', b'', b'5', b'']) + assert_equal(test, ['1', '2', '3', '4', '', '5', '']) # - strg = b" 1 3 4 5 6# test" + strg = " 1 3 4 5 6# test" test = LineSplitter(20)(strg) - assert_equal(test, [b'1 3 4 5 6']) + assert_equal(test, ['1 3 4 5 6']) # - strg = b" 1 3 4 5 6# test" + strg = " 1 3 4 5 6# test" test = LineSplitter(30)(strg) - assert_equal(test, [b'1 3 4 5 6']) + assert_equal(test, ['1 3 4 5 6']) def test_variable_fixed_width(self): - strg = b" 1 3 4 5 6# test" + strg = " 1 3 4 5 6# test" test = LineSplitter((3, 6, 6, 3))(strg) - assert_equal(test, [b'1', b'3', b'4 5', b'6']) + assert_equal(test, ['1', '3', '4 5', '6']) # - strg = b" 1 3 4 5 6# test" + strg = " 1 3 4 5 6# test" test = LineSplitter((6, 6, 9))(strg) - assert_equal(test, [b'1', b'3 4', b'5 6']) + assert_equal(test, ['1', '3 4', '5 6']) # ----------------------------------------------------------------------------- @@ -133,10 +133,9 @@ class TestNameValidator(object): def _bytes_to_date(s): - if sys.version_info[0] >= 3: - return date(*time.strptime(s.decode('latin1'), "%Y-%m-%d")[:3]) - else: - return date(*time.strptime(s, "%Y-%m-%d")[:3]) + if type(s) == bytes: + s = s.decode("latin1") + return date(*time.strptime(s, "%Y-%m-%d")[:3]) class TestStringConverter(object): @@ -155,7 +154,7 @@ class TestStringConverter(object): assert_equal(converter._status, 0) # test int - assert_equal(converter.upgrade(b'0'), 0) + assert_equal(converter.upgrade('0'), 0) assert_equal(converter._status, 1) # On systems where integer defaults to 32-bit, the statuses will be @@ -164,30 +163,30 @@ class TestStringConverter(object): status_offset = int(nx.dtype(nx.integer).itemsize < nx.dtype(nx.int64).itemsize) # test int > 2**32 - assert_equal(converter.upgrade(b'17179869184'), 17179869184) + assert_equal(converter.upgrade('17179869184'), 17179869184) assert_equal(converter._status, 1 + status_offset) # test float - assert_allclose(converter.upgrade(b'0.'), 0.0) + assert_allclose(converter.upgrade('0.'), 0.0) assert_equal(converter._status, 2 + status_offset) # test complex - assert_equal(converter.upgrade(b'0j'), complex('0j')) + assert_equal(converter.upgrade('0j'), complex('0j')) assert_equal(converter._status, 3 + status_offset) - # test str - assert_equal(converter.upgrade(b'a'), b'a') - assert_equal(converter._status, len(converter._mapper) - 1) + # test str TODO + #assert_equal(converter.upgrade(b'a'), b'a') + #assert_equal(converter._status, len(converter._mapper) - 1) def test_missing(self): "Tests the use of missing values." 
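These test updates track the converter API change: missing-value markers are now plain strings. A short sketch of the behaviour under test:

    >>> from numpy.lib._iotools import StringConverter
    >>> conv = StringConverter(int, default=-1, missing_values='N/A')
    >>> conv('12')
    12
    >>> conv('N/A')  # recognized as missing, falls back to the default
    -1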
- converter = StringConverter(missing_values=(b'missing', - b'missed')) - converter.upgrade(b'0') - assert_equal(converter(b'0'), 0) - assert_equal(converter(b''), converter.default) - assert_equal(converter(b'missing'), converter.default) - assert_equal(converter(b'missed'), converter.default) + converter = StringConverter(missing_values=('missing', + 'missed')) + converter.upgrade('0') + assert_equal(converter('0'), 0) + assert_equal(converter(''), converter.default) + assert_equal(converter('missing'), converter.default) + assert_equal(converter('missed'), converter.default) try: converter('miss') except ValueError: @@ -198,58 +197,58 @@ class TestStringConverter(object): dateparser = _bytes_to_date StringConverter.upgrade_mapper(dateparser, date(2000, 1, 1)) convert = StringConverter(dateparser, date(2000, 1, 1)) - test = convert(b'2001-01-01') + test = convert('2001-01-01') assert_equal(test, date(2001, 1, 1)) - test = convert(b'2009-01-01') + test = convert('2009-01-01') assert_equal(test, date(2009, 1, 1)) - test = convert(b'') + test = convert('') assert_equal(test, date(2000, 1, 1)) def test_string_to_object(self): "Make sure that string-to-object functions are properly recognized" conv = StringConverter(_bytes_to_date) - assert_equal(conv._mapper[-2][0](0), 0j) + assert_equal(conv._mapper[-3][0](0), 0j) assert_(hasattr(conv, 'default')) def test_keep_default(self): "Make sure we don't lose an explicit default" - converter = StringConverter(None, missing_values=b'', + converter = StringConverter(None, missing_values='', default=-999) - converter.upgrade(b'3.14159265') + converter.upgrade('3.14159265') assert_equal(converter.default, -999) assert_equal(converter.type, np.dtype(float)) # converter = StringConverter( - None, missing_values=b'', default=0) - converter.upgrade(b'3.14159265') + None, missing_values='', default=0) + converter.upgrade('3.14159265') assert_equal(converter.default, 0) assert_equal(converter.type, np.dtype(float)) def test_keep_default_zero(self): "Check that we don't lose a default of 0" converter = StringConverter(int, default=0, - missing_values=b"N/A") + missing_values="N/A") assert_equal(converter.default, 0) def test_keep_missing_values(self): "Check that we're not losing missing values" converter = StringConverter(int, default=0, - missing_values=b"N/A") + missing_values="N/A") assert_equal( - converter.missing_values, set([b'', b'N/A'])) + converter.missing_values, set(['', 'N/A'])) def test_int64_dtype(self): "Check that int64 integer types can be specified" converter = StringConverter(np.int64, default=0) - val = b"-9223372036854775807" + val = "-9223372036854775807" assert_(converter(val) == -9223372036854775807) - val = b"9223372036854775807" + val = "9223372036854775807" assert_(converter(val) == 9223372036854775807) def test_uint64_dtype(self): "Check that uint64 integer types can be specified" converter = StringConverter(np.uint64, default=0) - val = b"9223372043271415339" + val = "9223372043271415339" assert_(converter(val) == 9223372043271415339) diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py index 6f7fcc54c..35c37c7be 100644 --- a/numpy/lib/tests/test_io.py +++ b/numpy/lib/tests/test_io.py @@ -8,8 +8,11 @@ from tempfile import NamedTemporaryFile import time import warnings import gc -from io import BytesIO +import io +from io import BytesIO, StringIO from datetime import datetime +import locale +import re import numpy as np import numpy.ma as ma @@ -17,11 +20,19 @@ from numpy.lib._iotools import ConverterError, 
ConversionWarning from numpy.compat import asbytes, bytes, unicode, Path from numpy.ma.testutils import assert_equal from numpy.testing import ( - run_module_suite, assert_warns, assert_, assert_raises_regex, - assert_raises, assert_allclose, assert_array_equal, temppath, dec, IS_PYPY, - suppress_warnings + run_module_suite, assert_warns, assert_, + assert_raises_regex, assert_raises, assert_allclose, + assert_array_equal, temppath, tempdir, dec, IS_PYPY, suppress_warnings ) +def can_encode(v): + """ return True if `v` cannot be encoded with the preferred locale + encoding, i.e. the test using it should be skipped """ + try: + v.encode(locale.getpreferredencoding()) + return False # no skipping + except UnicodeEncodeError: + return True + class TextIO(BytesIO): """Helper IO class. @@ -44,6 +55,16 @@ class TextIO(BytesIO): MAJVER, MINVER = sys.version_info[:2] IS_64BIT = sys.maxsize > 2**32 +try: + import bz2 + HAS_BZ2 = True +except ImportError: + HAS_BZ2 = False +try: + import lzma + HAS_LZMA = True +except ImportError: + HAS_LZMA = False def strptime(s, fmt=None): @@ -52,10 +73,9 @@ 2.5. """ - if sys.version_info[0] >= 3: - return datetime(*time.strptime(s.decode('latin1'), fmt)[:3]) - else: - return datetime(*time.strptime(s, fmt)[:3]) + if type(s) == bytes: + s = s.decode("latin1") + return datetime(*time.strptime(s, fmt)[:3]) class RoundtripTest(object): @@ -466,8 +486,135 @@ class TestSaveTxt(object): b = np.loadtxt(w) assert_array_equal(a, b) + def test_unicode(self): + utf8 = b'\xcf\x96'.decode('UTF-8') + a = np.array([utf8], dtype=np.unicode) + with tempdir() as tmpdir: + # set encoding, as on windows it may not be unicode even on py3 + np.savetxt(os.path.join(tmpdir, 'test.csv'), a, fmt=['%s'], + encoding='UTF-8') + + def test_unicode_roundtrip(self): + utf8 = b'\xcf\x96'.decode('UTF-8') + a = np.array([utf8], dtype=np.unicode) + # our gz wrapper supports encoding + suffixes = ['', '.gz'] + # the Python 2 stdlib versions do not support encoding + if MAJVER > 2: + if HAS_BZ2: + suffixes.append('.bz2') + if HAS_LZMA: + suffixes.extend(['.xz', '.lzma']) + with tempdir() as tmpdir: + for suffix in suffixes: + np.savetxt(os.path.join(tmpdir, 'test.csv' + suffix), a, + fmt=['%s'], encoding='UTF-16-LE') + b = np.loadtxt(os.path.join(tmpdir, 'test.csv' + suffix), + encoding='UTF-16-LE', dtype=np.unicode) + assert_array_equal(a, b) + + def test_unicode_bytestream(self): + utf8 = b'\xcf\x96'.decode('UTF-8') + a = np.array([utf8], dtype=np.unicode) + s = BytesIO() + np.savetxt(s, a, fmt=['%s'], encoding='UTF-8') + s.seek(0) + assert_equal(s.read().decode('UTF-8'), utf8 + '\n') + + def test_unicode_stringstream(self): + utf8 = b'\xcf\x96'.decode('UTF-8') + a = np.array([utf8], dtype=np.unicode) + s = StringIO() + np.savetxt(s, a, fmt=['%s'], encoding='UTF-8') + s.seek(0) + assert_equal(s.read(), utf8 + '\n') + + +class LoadTxtBase(object): + def check_compressed(self, fopen, suffixes): + # Test that we can load data from a compressed file + wanted = np.arange(6).reshape((2, 3)) + linesep = ('\n', '\r\n', '\r') + for sep in linesep: + data = '0 1 2' + sep + '3 4 5' + for suffix in suffixes: + with temppath(suffix=suffix) as name: + with fopen(name, mode='wt', encoding='UTF-32-LE') as f: + f.write(data) + res = getattr(np, self.loadfunc)(name, + encoding='UTF-32-LE') + assert_array_equal(res, wanted) + res = getattr(np, self.loadfunc)( + fopen(name, "rt", encoding='UTF-32-LE')) + assert_array_equal(res, wanted) + + # Python2 .open does not support encoding + @np.testing.dec.skipif(MAJVER == 2) + def test_compressed_gzip(self): + 
self.check_compressed(gzip.open, ('.gz',)) + + @np.testing.dec.skipif(MAJVER == 2 or not HAS_BZ2) + def test_compressed_bz2(self): + self.check_compressed(bz2.open, ('.bz2',)) + + @np.testing.dec.skipif(MAJVER == 2 or not HAS_LZMA) + def test_compressed_lzma(self): + self.check_compressed(lzma.open, ('.xz', '.lzma')) + + def test_encoding(self): + with temppath() as path: + with open(path, "wb") as f: + f.write('0.\n1.\n2.'.encode("UTF-16")) + x = getattr(np, self.loadfunc)(path, encoding="UTF-16") + assert_array_equal(x, [0., 1., 2.]) + + def test_stringload(self): + # umlauts + nonascii = b'\xc3\xb6\xc3\xbc\xc3\xb6'.decode("UTF-8") + with temppath() as path: + with open(path, "wb") as f: + f.write(nonascii.encode("UTF-16")) + x = getattr(np, self.loadfunc)(path, encoding="UTF-16", dtype=np.unicode) + assert_array_equal(x, nonascii) + + def test_binary_decode(self): + utf16 = b'\xff\xfeh\x04 \x00i\x04 \x00j\x04' + v = getattr(np, self.loadfunc)(BytesIO(utf16), dtype=np.unicode, + encoding='UTF-16') + assert_array_equal(v, np.array(utf16.decode('UTF-16').split())) + + def test_converters_decode(self): + # test converters that decode strings + c = TextIO() + c.write(b'\xcf\x96') + c.seek(0) + x = getattr(np, self.loadfunc)(c, dtype=np.unicode, + converters={0: lambda x: x.decode('UTF-8')}) + a = np.array([b'\xcf\x96'.decode('UTF-8')]) + assert_array_equal(x, a) + + def test_converters_nodecode(self): + # test native string converters enabled by setting an encoding + utf8 = b'\xcf\x96'.decode('UTF-8') + with temppath() as path: + with io.open(path, 'wt', encoding='UTF-8') as f: + f.write(utf8) + x = getattr(np, self.loadfunc)(path, dtype=np.unicode, + converters={0: lambda x: x + 't'}, + encoding='UTF-8') + a = np.array([utf8 + 't']) + assert_array_equal(x, a) + + +class TestLoadTxt(LoadTxtBase): + loadfunc = 'loadtxt' + def setUp(self): + # lower chunksize for testing + self.orig_chunk = np.lib.npyio._loadtxt_chunksize + np.lib.npyio._loadtxt_chunksize = 1 + def tearDown(self): + np.lib.npyio._loadtxt_chunksize = self.orig_chunk -class TestLoadTxt(object): def test_record(self): c = TextIO() c.write('1 2\n3 4') @@ -869,9 +1016,24 @@ class TestLoadTxt(object): dt = np.dtype([('x', int), ('a', 'S10'), ('y', int)]) np.loadtxt(c, delimiter=',', dtype=dt, comments=None) # Should succeed + @np.testing.dec.skipif(locale.getpreferredencoding() == 'ANSI_X3.4-1968') + def test_binary_load(self): + butf8 = b"5,6,7,\xc3\x95scarscar\n\r15,2,3,hello\n\r"\ + b"20,2,3,\xc3\x95scar\n\r" + sutf8 = butf8.decode("UTF-8").replace("\r", "").splitlines() + with temppath() as path: + with open(path, "wb") as f: + f.write(butf8) + with open(path, "rb") as f: + x = np.loadtxt(f, encoding="UTF-8", dtype=np.unicode) + assert_array_equal(x, sutf8) + # test broken latin1 conversion people now rely on + with open(path, "rb") as f: + x = np.loadtxt(f, encoding="UTF-8", dtype="S") + expected = [b'5,6,7,\xc3\x95scarscar', b'15,2,3,hello', b'20,2,3,\xc3\x95scar'] + assert_array_equal(x, np.array(expected, dtype="S")) class Testfromregex(object): - # np.fromregex expects files opened in binary mode. 
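The comment removed above is now stale by design: fromregex accepts text input and, when given a filename, decodes it itself. A sketch with a hypothetical file:

    import numpy as np

    with open('temps.txt', 'w') as f:
        f.write('1.312 foo\n1.534 bar')
    dt = [('num', np.float64), ('key', 'U3')]
    arr = np.fromregex('temps.txt', r'([0-9.]+)\s+(\w+)', dt, encoding='UTF-8')
    # arr['key'] -> ['foo', 'bar'] as unicode strings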
def test_record(self): c = TextIO() c.write('1.312 foo\n1.534 bar\n4.444 qux') @@ -904,12 +1066,28 @@ class Testfromregex(object): a = np.array([(1312,), (1534,), (4444,)], dtype=dt) assert_array_equal(x, a) + def test_record_unicode(self): + utf8 = b'\xcf\x96' + with temppath() as path: + with open(path, 'wb') as f: + f.write(b'1.312 foo' + utf8 + b' \n1.534 bar\n4.444 qux') + + dt = [('num', np.float64), ('val', 'U4')] + x = np.fromregex(path, r"(?u)([0-9.]+)\s+(\w+)", dt, encoding='UTF-8') + a = np.array([(1.312, 'foo' + utf8.decode('UTF-8')), (1.534, 'bar'), + (4.444, 'qux')], dtype=dt) + assert_array_equal(x, a) + + regexp = re.compile(r"([0-9.]+)\s+(\w+)", re.UNICODE) + x = np.fromregex(path, regexp, dt, encoding='UTF-8') + assert_array_equal(x, a) + #####-------------------------------------------------------------------------- -class TestFromTxt(object): - # +class TestFromTxt(LoadTxtBase): + loadfunc = 'genfromtxt' def test_record(self): # Test w/ explicit dtype data = TextIO('1 2\n3 4') @@ -1012,7 +1190,10 @@ class TestFromTxt(object): def test_header(self): # Test retrieving a header data = TextIO('gender age weight\nM 64.0 75.0\nF 25.0 60.0') - test = np.ndfromtxt(data, dtype=None, names=True) + with warnings.catch_warnings(record=True) as w: + warnings.filterwarnings('always', '', np.VisibleDeprecationWarning) + test = np.ndfromtxt(data, dtype=None, names=True) + assert_(w[0].category is np.VisibleDeprecationWarning) control = {'gender': np.array([b'M', b'F']), 'age': np.array([64.0, 25.0]), 'weight': np.array([75.0, 60.0])} @@ -1023,7 +1204,10 @@ class TestFromTxt(object): def test_auto_dtype(self): # Test the automatic definition of the output dtype data = TextIO('A 64 75.0 3+4j True\nBCD 25 60.0 5+6j False') - test = np.ndfromtxt(data, dtype=None) + with warnings.catch_warnings(record=True) as w: + warnings.filterwarnings('always', '', np.VisibleDeprecationWarning) + test = np.ndfromtxt(data, dtype=None) + assert_(w[0].category is np.VisibleDeprecationWarning) control = [np.array([b'A', b'BCD']), np.array([64, 25]), np.array([75.0, 60.0]), @@ -1069,7 +1253,10 @@ F 35 58.330000 M 33 21.99 """) # The # is part of the first name and should be deleted automatically. 
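The warnings plumbing added to this and the following tests reflects a new deprecation: with dtype=None and the default encoding='bytes', string columns come back as byte strings and genfromtxt emits a VisibleDeprecationWarning asking for an explicit encoding. Passing one silences it and yields unicode, e.g. (sketch reusing the test's `data` stream):

    test = np.genfromtxt(data, names=True, dtype=None, encoding='latin1')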
- test = np.genfromtxt(data, names=True, dtype=None) + with warnings.catch_warnings(record=True) as w: + warnings.filterwarnings('always', '', np.VisibleDeprecationWarning) + test = np.genfromtxt(data, names=True, dtype=None) + assert_(w[0].category is np.VisibleDeprecationWarning) ctrl = np.array([('M', 21, 72.1), ('F', 35, 58.33), ('M', 33, 21.99)], dtype=[('gender', '|S1'), ('age', int), ('weight', float)]) assert_equal(test, ctrl) @@ -1080,14 +1267,20 @@ M 21 72.100000 F 35 58.330000 M 33 21.99 """) - test = np.genfromtxt(data, names=True, dtype=None) + with warnings.catch_warnings(record=True) as w: + warnings.filterwarnings('always', '', np.VisibleDeprecationWarning) + test = np.genfromtxt(data, names=True, dtype=None) + assert_(w[0].category is np.VisibleDeprecationWarning) assert_equal(test, ctrl) def test_autonames_and_usecols(self): # Tests names and usecols data = TextIO('A B C D\n aaaa 121 45 9.1') - test = np.ndfromtxt(data, usecols=('A', 'C', 'D'), - names=True, dtype=None) + with warnings.catch_warnings(record=True) as w: + warnings.filterwarnings('always', '', np.VisibleDeprecationWarning) + test = np.ndfromtxt(data, usecols=('A', 'C', 'D'), + names=True, dtype=None) + assert_(w[0].category is np.VisibleDeprecationWarning) control = np.array(('aaaa', 45, 9.1), dtype=[('A', '|S4'), ('C', int), ('D', float)]) assert_equal(test, control) @@ -1104,8 +1297,12 @@ M 33 21.99 def test_converters_with_usecols_and_names(self): # Tests names and usecols data = TextIO('A B C D\n aaaa 121 45 9.1') - test = np.ndfromtxt(data, usecols=('A', 'C', 'D'), names=True, - dtype=None, converters={'C': lambda s: 2 * int(s)}) + with warnings.catch_warnings(record=True) as w: + warnings.filterwarnings('always', '', np.VisibleDeprecationWarning) + test = np.ndfromtxt(data, usecols=('A', 'C', 'D'), names=True, + dtype=None, + converters={'C': lambda s: 2 * int(s)}) + assert_(w[0].category is np.VisibleDeprecationWarning) control = np.array(('aaaa', 90, 9.1), dtype=[('A', '|S4'), ('C', int), ('D', float)]) assert_equal(test, control) @@ -1225,6 +1422,18 @@ M 33 21.99 dtype=[('', '|S10'), ('', float)]) assert_equal(test, control) + def test_utf8_userconverters_with_explicit_dtype(self): + utf8 = b'\xcf\x96' + with temppath() as path: + with open(path, 'wb') as f: + f.write(b'skip,skip,2001-01-01' + utf8 + b',1.0,skip') + test = np.genfromtxt(path, delimiter=",", names=None, dtype=float, + usecols=(2, 3), converters={2: np.unicode}, + encoding='UTF-8') + control = np.array([('2001-01-01' + utf8.decode('UTF-8'), 1.)], + dtype=[('', '|U11'), ('', float)]) + assert_equal(test, control) + def test_spacedelimiter(self): # Test space delimiter data = TextIO("1 2 3 4 5\n6 7 8 9 10") @@ -1551,11 +1760,17 @@ M 33 21.99 # Test autostrip data = "01/01/2003 , 1.3, abcde" kwargs = dict(delimiter=",", dtype=None) - mtest = np.ndfromtxt(TextIO(data), **kwargs) + with warnings.catch_warnings(record=True) as w: + warnings.filterwarnings('always', '', np.VisibleDeprecationWarning) + mtest = np.ndfromtxt(TextIO(data), **kwargs) + assert_(w[0].category is np.VisibleDeprecationWarning) ctrl = np.array([('01/01/2003 ', 1.3, ' abcde')], dtype=[('f0', '|S12'), ('f1', float), ('f2', '|S8')]) assert_equal(mtest, ctrl) - mtest = np.ndfromtxt(TextIO(data), autostrip=True, **kwargs) + with warnings.catch_warnings(record=True) as w: + warnings.filterwarnings('always', '', np.VisibleDeprecationWarning) + mtest = np.ndfromtxt(TextIO(data), autostrip=True, **kwargs) + assert_(w[0].category is np.VisibleDeprecationWarning) ctrl = 
np.array([('01/01/2003', 1.3, 'abcde')], dtype=[('f0', '|S10'), ('f1', float), ('f2', '|S5')]) assert_equal(mtest, ctrl) @@ -1675,13 +1890,116 @@ M 33 21.99 def test_comments_is_none(self): # Github issue 329 (None was previously being converted to 'None'). - test = np.genfromtxt(TextIO("test1,testNonetherestofthedata"), - dtype=None, comments=None, delimiter=',') + with warnings.catch_warnings(record=True) as w: + warnings.filterwarnings('always', '', np.VisibleDeprecationWarning) + test = np.genfromtxt(TextIO("test1,testNonetherestofthedata"), + dtype=None, comments=None, delimiter=',') + assert_(w[0].category is np.VisibleDeprecationWarning) assert_equal(test[1], b'testNonetherestofthedata') - test = np.genfromtxt(TextIO("test1, testNonetherestofthedata"), - dtype=None, comments=None, delimiter=',') + with warnings.catch_warnings(record=True) as w: + warnings.filterwarnings('always', '', np.VisibleDeprecationWarning) + test = np.genfromtxt(TextIO("test1, testNonetherestofthedata"), + dtype=None, comments=None, delimiter=',') + assert_(w[0].category is np.VisibleDeprecationWarning) assert_equal(test[1], b' testNonetherestofthedata') + def test_latin1(self): + latin1 = b'\xf6\xfc\xf6' + norm = b"norm1,norm2,norm3\n" + enc = b"test1,testNonethe" + latin1 + b",test3\n" + s = norm + enc + norm + with warnings.catch_warnings(record=True) as w: + warnings.filterwarnings('always', '', np.VisibleDeprecationWarning) + test = np.genfromtxt(TextIO(s), + dtype=None, comments=None, delimiter=',') + assert_(w[0].category is np.VisibleDeprecationWarning) + assert_equal(test[1, 0], b"test1") + assert_equal(test[1, 1], b"testNonethe" + latin1) + assert_equal(test[1, 2], b"test3") + test = np.genfromtxt(TextIO(s), + dtype=None, comments=None, delimiter=',', + encoding='latin1') + assert_equal(test[1, 0], u"test1") + assert_equal(test[1, 1], u"testNonethe" + latin1.decode('latin1')) + assert_equal(test[1, 2], u"test3") + + with warnings.catch_warnings(record=True) as w: + warnings.filterwarnings('always', '', np.VisibleDeprecationWarning) + test = np.genfromtxt(TextIO(b"0,testNonethe" + latin1), + dtype=None, comments=None, delimiter=',') + assert_(w[0].category is np.VisibleDeprecationWarning) + assert_equal(test['f0'], 0) + assert_equal(test['f1'], b"testNonethe" + latin1) + + def test_binary_decode_autodtype(self): + utf16 = b'\xff\xfeh\x04 \x00i\x04 \x00j\x04' + v = getattr(np, self.loadfunc)(BytesIO(utf16), dtype=None, + encoding='UTF-16') + assert_array_equal(v, np.array(utf16.decode('UTF-16').split())) + + def test_utf8_byte_encoding(self): + utf8 = b"\xcf\x96" + norm = b"norm1,norm2,norm3\n" + enc = b"test1,testNonethe" + utf8 + b",test3\n" + s = norm + enc + norm + with warnings.catch_warnings(record=True) as w: + warnings.filterwarnings('always', '', np.VisibleDeprecationWarning) + test = np.genfromtxt(TextIO(s), + dtype=None, comments=None, delimiter=',') + assert_(w[0].category is np.VisibleDeprecationWarning) + ctl = np.array([ + [b'norm1', b'norm2', b'norm3'], + [b'test1', b'testNonethe' + utf8, b'test3'], + [b'norm1', b'norm2', b'norm3']]) + assert_array_equal(test, ctl) + + def test_utf8_file(self): + utf8 = b"\xcf\x96" + latin1 = b"\xf6\xfc\xf6" + with temppath() as path: + with open(path, "wb") as f: + f.write((b"test1,testNonethe" + utf8 + b",test3\n") * 2) + test = np.genfromtxt(path, dtype=None, comments=None, + delimiter=',', encoding="UTF-8") + ctl = np.array([ + ["test1", "testNonethe" + utf8.decode("UTF-8"), "test3"], + ["test1", "testNonethe" + utf8.decode("UTF-8"), "test3"]], + 
dtype=np.unicode) + assert_array_equal(test, ctl) + + # test a mixed dtype + with open(path, "wb") as f: + f.write(b"0,testNonethe" + utf8) + test = np.genfromtxt(path, dtype=None, comments=None, + delimiter=',', encoding="UTF-8") + assert_equal(test['f0'], 0) + assert_equal(test['f1'], "testNonethe" + utf8.decode("UTF-8")) + + @np.testing.dec.skipif(can_encode(b"\xcf\x96".decode('UTF-8'))) + def test_utf8_file_nodtype_unicode(self): + # bytes encoding with non-latin1 -> unicode upcast + utf8 = b"\xcf\x96" + latin1 = b"\xf6\xfc\xf6" + with temppath() as path: + with io.open(path, "wt", + encoding=locale.getpreferredencoding()) as f: + f.write(u"norm1,norm2,norm3\n") + f.write(u"norm1," + latin1.decode("latin1") + u",norm3\n") + f.write(u"test1,testNonethe" + utf8.decode("UTF-8") + + u",test3\n") + with warnings.catch_warnings(record=True) as w: + warnings.filterwarnings('always', '', + np.VisibleDeprecationWarning) + test = np.genfromtxt(path, dtype=None, comments=None, + delimiter=',') + assert_(w[0].category is np.VisibleDeprecationWarning) + ctl = np.array([ + ["norm1", "norm2", "norm3"], + ["norm1", latin1.decode("latin1"), "norm3"], + ["test1", "testNonethe" + utf8.decode("UTF-8"), "test3"]], + dtype=np.unicode) + assert_array_equal(test, ctl) + def test_recfromtxt(self): # data = TextIO('A,B\n0,1\n2,3') @@ -1793,11 +2111,7 @@ M 33 21.99 # Test that we can load data from a filename as well as a file # object tgt = np.arange(6).reshape((2, 3)) - if sys.version_info[0] >= 3: - # python 3k is known to fail for '\r' - linesep = ('\n', '\r\n') - else: - linesep = ('\n', '\r\n', '\r') + linesep = ('\n', '\r\n', '\r') for sep in linesep: data = '0 1 2' + sep + '3 4 5' @@ -1807,6 +2121,22 @@ M 33 21.99 res = np.genfromtxt(name) assert_array_equal(res, tgt) + def test_gft_from_gzip(self): + # Test that we can load data from a gzipped file + wanted = np.arange(6).reshape((2, 3)) + linesep = ('\n', '\r\n', '\r') + + for sep in linesep: + data = '0 1 2' + sep + '3 4 5' + s = BytesIO() + with gzip.GzipFile(fileobj=s, mode='w') as g: + g.write(asbytes(data)) + + with temppath(suffix='.gz2') as name: + with open(name, 'w') as f: + f.write(data) + assert_array_equal(np.genfromtxt(name), wanted) + def test_gft_using_generator(self): # gft doesn't work with unicode. def count(): -- cgit v1.2.1 From 55273d236945aa5f4b6e01682dfef82384a7fd65 Mon Sep 17 00:00:00 2001 From: Charles Harris Date: Sun, 19 Nov 2017 11:51:55 -0700 Subject: DOC: Add some docstrings and edit others. Add docstrings for some of the support functions in _datasource and npyio in order to aid future maintainers. [ci skip] --- numpy/lib/_datasource.py | 44 ++++++++++++++++++++++++++++++++++++++++++-- numpy/lib/npyio.py | 42 +++++++++++++++++++++++++++--------------- 2 files changed, 69 insertions(+), 17 deletions(-) (limited to 'numpy/lib') diff --git a/numpy/lib/_datasource.py b/numpy/lib/_datasource.py index ad939df3f..1b5ecb34e 100644 --- a/numpy/lib/_datasource.py +++ b/numpy/lib/_datasource.py @@ -43,6 +43,18 @@ import io _open = open def _check_mode(mode, encoding, newline): + """Check mode and that encoding and newline are compatible. + + Parameters + ---------- + mode : str + File open mode. + encoding : str + File encoding. + newline : str + Newline for text files. 
From 55273d236945aa5f4b6e01682dfef82384a7fd65 Mon Sep 17 00:00:00 2001
From: Charles Harris
Date: Sun, 19 Nov 2017 11:51:55 -0700
Subject: DOC: Add some docstrings and edit others.

Add docstrings for some of the support functions in _datasource and
npyio in order to aid future maintainers.

[ci skip]
---
 numpy/lib/_datasource.py | 44 ++++++++++++++++++++++++++++++++++++++++++--
 numpy/lib/npyio.py       | 42 +++++++++++++++++++++++++++---------------
 2 files changed, 69 insertions(+), 17 deletions(-)

(limited to 'numpy/lib')

diff --git a/numpy/lib/_datasource.py b/numpy/lib/_datasource.py
index ad939df3f..1b5ecb34e 100644
--- a/numpy/lib/_datasource.py
+++ b/numpy/lib/_datasource.py
@@ -43,6 +43,18 @@ import io

 _open = open

 def _check_mode(mode, encoding, newline):
+    """Check mode and that encoding and newline are compatible.
+
+    Parameters
+    ----------
+    mode : str
+        File open mode.
+    encoding : str
+        File encoding.
+    newline : str
+        Newline for text files.
+
+    """
     if "t" in mode:
         if "b" in mode:
             raise ValueError("Invalid mode: %r" % (mode,))
@@ -52,8 +64,21 @@ def _check_mode(mode, encoding, newline):
         if newline is not None:
             raise ValueError("Argument 'newline' not supported in binary mode")

+
 def _python2_bz2open(fn, mode, encoding, newline):
-    """ wrapper to open bz2 in text mode """
+    """Wrapper to open bz2 in text mode.
+
+    Parameters
+    ----------
+    fn : str
+        File name
+    mode : {'r', 'w'}
+        File mode. Note that bz2 text files are not supported.
+    encoding : str
+        Ignored, text bz2 files not supported in Python2.
+    newline : str
+        Ignored, text bz2 files not supported in Python2.
+    """
     import bz2

     _check_mode(mode, encoding, newline)
@@ -65,7 +90,21 @@ def _python2_bz2open(fn, mode, encoding, newline):
         return bz2.BZ2File(fn, mode)

 def _python2_gzipopen(fn, mode, encoding, newline):
-    """ wrapper to open gzip in text mode """
+    """Wrapper to open gzip in text mode.
+
+    Parameters
+    ----------
+    fn : str, bytes, file
+        File path or opened file.
+    mode : str
+        File mode. The actual files are opened as binary, but will be
+        decoded using the specified `encoding` and `newline`.
+    encoding : str
+        Encoding to be used when reading/writing as text.
+    newline : str
+        Newline to be used when reading/writing as text.
+
+    """
     import gzip
     # gzip is lacking read1 needed for TextIOWrapper
     class GzipWrap(gzip.GzipFile):
@@ -75,6 +114,7 @@ def _python2_gzipopen(fn, mode, encoding, newline):
     _check_mode(mode, encoding, newline)

     gz_mode = mode.replace("t", "")
+
     if isinstance(fn, (str, bytes)):
         binary_file = GzipWrap(fn, gz_mode)
     elif hasattr(fn, "read") or hasattr(fn, "write"):
diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py
index fe2aa436b..6b65834ed 100644
--- a/numpy/lib/npyio.py
+++ b/numpy/lib/npyio.py
@@ -296,7 +296,7 @@ def load(file, mmap_mode=None, allow_pickle=True, fix_imports=True,
         used in Python 3.
     encoding : str, optional
         What encoding to use when reading Python 2 strings. Only useful when
-        loading Python 2 generated pickled files on Python 3, which includes
+        loading Python 2 generated pickled files in Python 3, which includes
         npy/npz files containing object arrays. Values other than 'latin1',
         'ASCII', and 'bytes' are not allowed, as they can corrupt numerical
         data. Default: 'ASCII'
@@ -819,13 +819,13 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
         Legal values: 0 (default), 1 or 2.

         .. versionadded:: 1.6.0
-    encoding: string, optional
+    encoding : str, optional
         Encoding used to decode the input file. Does not apply to input streams.
         The special value 'bytes' enables backward compatibility workarounds
         that ensure you receive byte arrays as results if possible and pass
         latin1 encoded strings to converters. Override this value to receive
-        unicode arrays and pass strings as input to converters.
-        If set to None the system default is used.
+        unicode arrays and pass strings as input to converters. If set to None
+        the system default is used. The default value is 'bytes'.

         .. versionadded:: 1.14.0
@@ -993,7 +993,17 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
             return []

     def read_data(chunk_size):
-        # Parse each line, including the first
+        """Parse each line, including the first.
+
+        The file read, `fh`, is a global defined above.
+
+        Parameters
+        ----------
+        chunk_size : int
+            At most `chunk_size` lines are read at a time, with iteration
+            until all lines are read.
+
+        """
         X = []
         for i, line in enumerate(itertools.chain([first_line], fh)):
             vals = split_line(line)
@@ -1171,7 +1181,7 @@ def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='',
         ``numpy.loadtxt``.

         .. versionadded:: 1.7.0
-    encoding: string, optional
+    encoding : str, optional
         Encoding used to encode the output file. Does not apply to output
         streams.

         .. versionadded:: 1.14.0
@@ -1251,7 +1261,9 @@ def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='',
         delimiter = asstr(delimiter)

     class WriteWrap(object):
-        """ convert to unicode in py2 or to bytes on bytestream inputs """
+        """Convert to unicode in py2 or to bytes on bytestream inputs.
+
+        """
         def __init__(self, fh, encoding):
             self.fh = fh
             self.encoding = encoding
@@ -1387,7 +1399,7 @@ def fromregex(file, regexp, dtype, encoding=None):
         Groups in the regular expression correspond to fields in the dtype.
     dtype : dtype or list of dtypes
         Dtype for the structured array.
-    encoding: string, optional
+    encoding : str, optional
         Encoding used to decode the input file. Does not apply to input streams.

         .. versionadded:: 1.14.0
@@ -1562,13 +1574,13 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
         to read the entire file.

         .. versionadded:: 1.10.0
-    encoding: string, optional
-        Encoding used to decode the inputfile. Does not apply to input streams.
-        The special value 'bytes' enables backward compatibility workarounds
-        that ensures you receive byte arrays as results if possible and passes
-        latin1 encoded strings to converters. Override this value to receive
-        unicode arrays and pass strings as input to converters.
-        If set to None the system default is used.
+    encoding : str, optional
+        Encoding used to decode the input file. Does not apply when `fname` is
+        a file object. The special value 'bytes' enables backward compatibility
+        workarounds that ensure that you receive byte arrays when possible
+        and pass latin1 encoded strings to converters. Override this value to
+        receive unicode arrays and pass strings as input to converters. If set
+        to None the system default is used. The default value is 'bytes'.

         .. versionadded:: 1.14.0
--
cgit v1.2.1
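
The loadtxt/savetxt docstrings above describe a symmetric round trip through
the new keyword. A minimal sketch of that round trip, assuming Python 3 and a
hypothetical file name::

    import numpy as np

    a = np.array([[0.0, 1.0], [2.0, 3.0]])

    # savetxt encodes the formatted rows before writing them out.
    np.savetxt('data.txt', a, fmt='%.2f', encoding='UTF-8')

    # loadtxt decodes the file with the same encoding before parsing.
    b = np.loadtxt('data.txt', encoding='UTF-8')
    assert np.array_equal(a, b)
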
From d9ca11117f37d48d07818a3aae3641c023454269 Mon Sep 17 00:00:00 2001
From: Charles Harris
Date: Sun, 19 Nov 2017 13:43:32 -0700
Subject: MAINT: Refactor some code in npyio.py.

---
 numpy/lib/_datasource.py         |  8 ++++-
 numpy/lib/_iotools.py            | 48 ++++++++++++++++++-------
 numpy/lib/npyio.py               | 78 ++++++++++++++++++++--------------------
 numpy/lib/tests/test__iotools.py |  2 --
 4 files changed, 81 insertions(+), 55 deletions(-)

(limited to 'numpy/lib')

diff --git a/numpy/lib/_datasource.py b/numpy/lib/_datasource.py
index 1b5ecb34e..aec84865f 100644
--- a/numpy/lib/_datasource.py
+++ b/numpy/lib/_datasource.py
@@ -164,6 +164,7 @@ class _FileOpeners(object):
     def _load(self):
         if self._loaded:
             return
+
         try:
             import bz2
             if sys.version_info[0] >= 3:
@@ -172,6 +173,7 @@ class _FileOpeners(object):
                 self._file_openers[".bz2"] = _python2_bz2open
         except ImportError:
             pass
+
         try:
             import gzip
             if sys.version_info[0] >= 3:
@@ -180,12 +182,16 @@ class _FileOpeners(object):
                 self._file_openers[".gz"] = _python2_gzipopen
         except ImportError:
             pass
+
         try:
             import lzma
             self._file_openers[".xz"] = lzma.open
             self._file_openers[".lzma"] = lzma.open
-        except ImportError:
+        except (ImportError, AttributeError):
+            # There are incompatible backports of lzma that do not have the
+            # lzma.open attribute, so catch that as well as ImportError.
             pass
+
         self._loaded = True

     def keys(self):
diff --git a/numpy/lib/_iotools.py b/numpy/lib/_iotools.py
index 8e091d42d..b7db77f32 100644
--- a/numpy/lib/_iotools.py
+++ b/numpy/lib/_iotools.py
@@ -18,7 +18,24 @@ else:

 def _decode_line(line, encoding=None):
-    """ decode bytes from binary input streams, default to latin1 """
+    """Decode bytes from binary input streams.
+
+    Defaults to decoding from 'latin1'. That differs from the behavior of
+    np.compat.asunicode that decodes from 'ascii'.
+
+    Parameters
+    ----------
+    line : str or bytes
+        Line to be decoded.
+    encoding : str, optional
+        Encoding used to decode `line`; 'latin1' if not given.
+
+    Returns
+    -------
+    decoded_line : unicode
+        Unicode in Python 2, a str (unicode) in Python 3.
+
+    """
     if type(line) is bytes:
         if encoding is None:
             line = line.decode('latin1')
@@ -510,8 +525,10 @@ class StringConverter(object):
         Value to return by default, that is, when the string to be converted
         is flagged as missing. If not given, `StringConverter` tries to supply
         a reasonable default value.
-    missing_values : sequence of str, optional
-        Sequence of strings indicating a missing value.
+    missing_values : {None, sequence of str}, optional
+        ``None`` or sequence of strings indicating a missing value. If ``None``
+        then missing values are indicated by empty entries. The default is
+        ``None``.
     locked : bool, optional
         Whether the StringConverter should be locked to prevent automatic
         upgrade or not. Default is False.
@@ -813,8 +830,9 @@ class StringConverter(object):
             A string representing a standard input value of the converter.
             This string is used to help defining a reasonable default
            value.
-        missing_values : sequence of str, optional
-            Sequence of strings indicating a missing value.
+        missing_values : {sequence of str, None}, optional
+            Sequence of strings indicating a missing value. If ``None``, then
+            the existing `missing_values` are cleared. The default is `''`.
         locked : bool, optional
             Whether the StringConverter should be locked to prevent
             automatic upgrade or not. Default is False.
@@ -828,6 +846,7 @@ class StringConverter(object):
         """
         self.func = func
         self._locked = locked
+
         # Don't reset the default to None if we can avoid it
         if default is not None:
             self.default = default
@@ -838,15 +857,18 @@ class StringConverter(object):
         except (TypeError, ValueError):
             tester = None
         self.type = self._dtypeortype(self._getdtype(tester))
-        # Add the missing values to the existing set
-        if missing_values is not None:
-            if isinstance(missing_values, basestring):
-                self.missing_values.add(missing_values)
-            elif hasattr(missing_values, '__iter__'):
-                for val in missing_values:
-                    self.missing_values.add(val)
+
+        # Add the missing values to the existing set or clear it.
+        if missing_values is None:
+            # Clear all missing values even though the ctor initializes it to
+            # {''} when the argument is None.
+            self.missing_values = {}
         else:
-            self.missing_values = []
+            if not np.iterable(missing_values):
+                missing_values = [missing_values]
+            if not all(isinstance(v, basestring) for v in missing_values):
+                raise TypeError("missing_values must be strings or unicode")
+            self.missing_values.update(missing_values)

 def easy_dtype(ndtype, names=None, defaultfmt="f%i", **validationargs):
diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py
index 6b65834ed..e4d827334 100644
--- a/numpy/lib/npyio.py
+++ b/numpy/lib/npyio.py
@@ -734,7 +734,7 @@ def _getconv(dtype):
     def floatconv(x):
         x.lower()
         if '0x' in x:
-            return float.fromhex(asstr(x))
+            return float.fromhex(x)
         return float(x)

     typ = dtype.type
@@ -782,13 +782,13 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
         each row will be interpreted as an element of the array. In this
         case, the number of columns used must match the number of fields in
         the data-type.
-    comments : str or sequence, optional
+    comments : str or sequence of str, optional
         The characters or list of characters used to indicate the start of a
-        comment;
-        default: '#'.
+        comment. For backwards compatibility, byte strings will be decoded as
+        'latin1'. The default is '#'.
     delimiter : str, optional
-        The string used to separate values. By default, this is any
-        whitespace.
+        The string used to separate values. For backwards compatibility, byte
+        strings will be decoded as 'latin1'. The default is whitespace.
     converters : dict, optional
         A dictionary mapping column number to a function that will convert
         that column to a float.  E.g., if column 0 is a date string:
@@ -797,18 +797,15 @@
         ``converters = {3: lambda s: float(s.strip() or 0)}``.  Default: None.
     skiprows : int, optional
         Skip the first `skiprows` lines; default: 0.
-
     usecols : int or sequence, optional
         Which columns to read, with 0 being the first. For example,
         usecols = (1,4,5) will extract the 2nd, 5th and 6th columns.
         The default, None, results in all columns being read.

-        .. versionadded:: 1.11.0
-
-        Also when a single column has to be read it is possible to use
-        an integer instead of a tuple. E.g ``usecols = 3`` reads the
-        fourth column the same way as `usecols = (3,)`` would.
-
+        .. versionchanged:: 1.11.0
+            When a single column has to be read it is possible to use
+            an integer instead of a tuple. E.g. ``usecols = 3`` reads the
+            fourth column the same way as ``usecols = (3,)`` would.
     unpack : bool, optional
         If True, the returned array is transposed, so that arguments may be
         unpacked using ``x, y, z = loadtxt(...)``.  When used with a structured
@@ -877,12 +874,14 @@
     if comments is not None:
         if isinstance(comments, (basestring, bytes)):
             comments = [comments]
-
         comments = [_decode_line(x) for x in comments]
+
         # Compile regex for comments beforehand
         comments = (re.escape(comment) for comment in comments)
         regex_comments = re.compile('|'.join(comments))
+
+    if delimiter is not None:
+        delimiter = _decode_line(delimiter)
+
     user_converters = converters

     if encoding == 'bytes':
@@ -1071,7 +1070,7 @@
                 # Unused converter specified
                 continue
             if byte_converters:
-                # converters may use decode to workaround numpy's oldd behaviour,
+                # converters may use decode to workaround numpy's old behaviour,
                 # so encode the string again before passing to the user converter
                 def tobytes_first(x, conv):
                     if type(x) is bytes:
@@ -1181,9 +1180,11 @@ def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='',
         ``numpy.loadtxt``.

         .. versionadded:: 1.7.0
-    encoding : str, optional
+    encoding : {None, str}, optional
         Encoding used to encode the output file. Does not apply to output
-        streams.
+        streams. If the encoding is something other than 'bytes' or 'latin1'
+        you will not be able to load the file in NumPy versions < 1.14. Default
+        is 'latin1'.

         .. versionadded:: 1.14.0
@@ -1908,7 +1909,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
                 if conv is bytes:
                     user_conv = asbytes
                 elif byte_converters:
-                    # converters may use decode to workaround numpy's oldd behaviour,
+                    # converters may use decode to workaround numpy's old behaviour,
                     # so encode the string again before passing to the user converter
                     def tobytes_first(x, conv):
                         if type(x) is bytes:
@@ -1927,7 +1928,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
         user_converters.update(uc_update)

     # Fixme: possible error as following variable never used.
-    #miss_chars = [_.missing_values for _ in converters]
+    # miss_chars = [_.missing_values for _ in converters]

     # Initialize the output lists ...
     # ... rows
@@ -2041,39 +2042,38 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
             strcolidx = [i for (i, v) in enumerate(column_types)
                          if v == np.unicode_]
-            typestr = 'U'
+
+            type_str = np.unicode_
             if byte_converters and strcolidx:
                 # convert strings back to bytes for backward compatibility
                 warnings.warn(
-                    "Reading strings without specifying the encoding argument is "
-                    "deprecated. Set encoding, use None for the system default.",
+                    "Reading unicode strings without specifying the encoding "
+                    "argument is deprecated. Set the encoding, use None for the "
+                    "system default.",
                     np.VisibleDeprecationWarning, stacklevel=2)
+
+                def encode_unicode_cols(row_tup):
+                    row = list(row_tup)
+                    for i in strcolidx:
+                        row[i] = row[i].encode('latin1')
+                    return tuple(row)
+
                 try:
-                    for j in range(len(data)):
-                        row = list(data[j])
-                        for i in strcolidx:
-                            row[i] = row[i].encode('latin1')
-                        data[j] = tuple(row)
-                    typestr = 'S'
+                    data = [encode_unicode_cols(r) for r in data]
+                    type_str = np.bytes_
                 except UnicodeEncodeError:
-                    # we must use unicode, revert encoding
-                    for k in range(0, j + 1):
-                        row = list(data[k])
-                        for i in strcolidx:
-                            if isinstance(row[i], bytes):
-                                row[i] = row[i].decode('latin1')
-                        data[k] = tuple(row)
+                    pass
+
             # ... and take the largest number of chars.
             for i in strcolidx:
-                column_types[i] = "|%s%i" % (typestr, max(len(row[i]) for row in data))
+                max_line_length = max(len(row[i]) for row in data)
+                column_types[i] = np.dtype((type_str, max_line_length))
+
         #
         if names is None:
             # If the dtype is uniform, don't define names, else use ''
             base = set([c.type for c in converters if c._checked])
             if len(base) == 1:
                 if strcolidx:
-                    (ddtype, mdtype) = (typestr, bool)
+                    (ddtype, mdtype) = (type_str, bool)
                 else:
                     (ddtype, mdtype) = (list(base)[0], bool)
             else:
@@ -2148,7 +2148,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
     # Try to take care of the missing data we missed
     names = output.dtype.names
     if usemask and names:
-        for (name, conv) in zip(names or (), converters):
+        for (name, conv) in zip(names, converters):
             missing_values = [conv(_) for _ in conv.missing_values
                               if _ != '']
             for mval in missing_values:
diff --git a/numpy/lib/tests/test__iotools.py b/numpy/lib/tests/test__iotools.py
index 990ee126d..b25b42f8c 100644
--- a/numpy/lib/tests/test__iotools.py
+++ b/numpy/lib/tests/test__iotools.py
@@ -133,8 +133,6 @@ class TestNameValidator(object):

 def _bytes_to_date(s):
-    if type(s) == bytes:
-        s = s.decode("latin1")
     return date(*time.strptime(s, "%Y-%m-%d")[:3])
--
cgit v1.2.1
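
The missing_values handling refactored above is easiest to see through
StringConverter directly. A minimal sketch against the private
numpy.lib._iotools API, with hypothetical sample values::

    from numpy.lib._iotools import StringConverter

    # genfromtxt drives field conversion through StringConverter; strings
    # listed in missing_values make the converter fall back to its default.
    conv = StringConverter(int, default=-1, missing_values="N/A")
    print(conv('42'))   # -> 42
    print(conv('N/A'))  # -> -1, flagged as missing
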
From 1d97b3aafdca2722bbe2f0c10a96544121c8f78b Mon Sep 17 00:00:00 2001
From: Charles Harris
Date: Tue, 21 Nov 2017 10:10:26 -0700
Subject: MAINT: Various minor code cleanups.

Minor cleanups of old code to reflect more modern usage.
---
 numpy/lib/_datasource.py         |  15 +++++
 numpy/lib/_iotools.py            |   4 +-
 numpy/lib/tests/test__iotools.py |  20 +++++--
 numpy/lib/tests/test_io.py       | 115 ++++++++++++++++++++-------------------
 4 files changed, 90 insertions(+), 64 deletions(-)

(limited to 'numpy/lib')

diff --git a/numpy/lib/_datasource.py b/numpy/lib/_datasource.py
index aec84865f..6f1295f09 100644
--- a/numpy/lib/_datasource.py
+++ b/numpy/lib/_datasource.py
@@ -238,6 +238,11 @@ def open(path, mode='r', destpath=os.curdir, encoding=None, newline=None):
         Path to the directory where the source file gets downloaded to for
         use.  If `destpath` is None, a temporary directory will be created.
         The default path is the current directory.
+    encoding : {None, str}, optional
+        Open text file with given encoding. The default encoding will be
+        what `io.open` uses.
+    newline : {None, str}, optional
+        Newline to use when reading text file.

     Returns
     -------
@@ -577,6 +582,11 @@ class DataSource (object):
             Mode to open `path`.  Mode 'r' for reading, 'w' for writing,
             'a' to append. Available modes depend on the type of object
             specified by `path`. Default is 'r'.
+        encoding : {None, str}, optional
+            Open text file with given encoding. The default encoding will be
+            what `io.open` uses.
+        newline : {None, str}, optional
+            Newline to use when reading text file.

         Returns
         -------
@@ -741,6 +751,11 @@ class Repository (DataSource):
             Mode to open `path`.  Mode 'r' for reading, 'w' for writing,
             'a' to append. Available modes depend on the type of object
             specified by `path`. Default is 'r'.
+        encoding : {None, str}, optional
+            Open text file with given encoding. The default encoding will be
+            what `io.open` uses.
+        newline : {None, str}, optional
+            Newline to use when reading text file.

         Returns
         -------
diff --git a/numpy/lib/_iotools.py b/numpy/lib/_iotools.py
index b7db77f32..27143e5c6 100644
--- a/numpy/lib/_iotools.py
+++ b/numpy/lib/_iotools.py
@@ -861,8 +861,8 @@ class StringConverter(object):

         # Add the missing values to the existing set or clear it.
         if missing_values is None:
             # Clear all missing values even though the ctor initializes it to
-            # {''} when the argument is None.
-            self.missing_values = {}
+            # set(['']) when the argument is None.
+            self.missing_values = set()
         else:
             if not np.iterable(missing_values):
                 missing_values = [missing_values]
diff --git a/numpy/lib/tests/test__iotools.py b/numpy/lib/tests/test__iotools.py
index b25b42f8c..54fac8da4 100644
--- a/numpy/lib/tests/test__iotools.py
+++ b/numpy/lib/tests/test__iotools.py
@@ -12,6 +12,7 @@ from numpy.lib._iotools import (
     LineSplitter, NameValidator, StringConverter,
     has_nested_fields, easy_dtype, flatten_dtype
     )
+from numpy.compat import unicode

 class TestLineSplitter(object):
@@ -155,10 +156,10 @@ class TestStringConverter(object):
         assert_equal(converter.upgrade('0'), 0)
         assert_equal(converter._status, 1)

-        # On systems where integer defaults to 32-bit, the statuses will be
+        # On systems where long defaults to 32-bit, the statuses will be
         # offset by one, so we check for this here.
         import numpy.core.numeric as nx
-        status_offset = int(nx.dtype(nx.integer).itemsize < nx.dtype(nx.int64).itemsize)
+        status_offset = int(nx.dtype(nx.int_).itemsize < nx.dtype(nx.int64).itemsize)

         # test int > 2**32
         assert_equal(converter.upgrade('17179869184'), 17179869184)
@@ -172,9 +173,15 @@ class TestStringConverter(object):
         assert_equal(converter.upgrade('0j'), complex('0j'))
         assert_equal(converter._status, 3 + status_offset)

-        # test str TODO
-        #assert_equal(converter.upgrade(b'a'), b'a')
-        #assert_equal(converter._status, len(converter._mapper) - 1)
+        # test str
+        # note that the longdouble type has been skipped, so the
+        # _status increases by 2. Everything should succeed with
+        # unicode conversion (5).
+        for s in ['a', u'a', b'a']:
+            res = converter.upgrade(s)
+            assert_(type(res) is unicode)
+            assert_equal(res, u'a')
+        assert_equal(converter._status, 5 + status_offset)

     def test_missing(self):
         "Tests the use of missing values."
@@ -204,8 +211,9 @@ class TestStringConverter(object):

     def test_string_to_object(self):
         "Make sure that string-to-object functions are properly recognized"
+        old_mapper = StringConverter._mapper[:]  # copy of list
         conv = StringConverter(_bytes_to_date)
-        assert_equal(conv._mapper[-3][0](0), 0j)
+        assert_equal(conv._mapper, old_mapper)
         assert_(hasattr(conv, 'default'))

     def test_keep_default(self):
diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py
index 35c37c7be..75a8e4968 100644
--- a/numpy/lib/tests/test_io.py
+++ b/numpy/lib/tests/test_io.py
@@ -20,19 +20,11 @@ from numpy.lib._iotools import ConverterError, ConversionWarning
 from numpy.compat import asbytes, bytes, unicode, Path
 from numpy.ma.testutils import assert_equal
 from numpy.testing import (
-    run_module_suite, assert_warns, assert_,
+    run_module_suite, assert_warns, assert_, SkipTest,
     assert_raises_regex, assert_raises, assert_allclose,
     assert_array_equal, temppath, tempdir, dec, IS_PYPY, suppress_warnings
     )

-def can_encode(v):
-    """ check if bytes can be decoded with default encoding """
-    try:
-        v.encode(locale.getpreferredencoding())
-        return False  # no skipping
-    except UnicodeEncodeError:
-        return True
-

 class TextIO(BytesIO):
     """Helper IO class.
@@ -164,7 +156,7 @@ class RoundtripTest(object):
         a = np.array([1, 2, 3, 4], int)
         self.roundtrip(a)

-    @np.testing.dec.knownfailureif(sys.platform == 'win32', "Fail on Win32")
+    @dec.knownfailureif(sys.platform == 'win32', "Fail on Win32")
     def test_mmap(self):
         a = np.array([[1, 2.5], [4, 7.3]])
         self.roundtrip(a, file_on_disk=True, load_kwds={'mmap_mode': 'r'})
@@ -208,8 +200,8 @@ class TestSavezLoad(RoundtripTest):
                 self.arr_reloaded.fid.close()
                 os.remove(self.arr_reloaded.fid.name)

-    @np.testing.dec.skipif(not IS_64BIT, "Works only with 64bit systems")
-    @np.testing.dec.slow
+    @dec.skipif(not IS_64BIT, "Works only with 64bit systems")
+    @dec.slow
     def test_big_arrays(self):
         L = (1 << 31) + 100000
         a = np.empty(L, dtype=np.uint8)
@@ -285,7 +277,7 @@ class TestSavezLoad(RoundtripTest):
             fp.seek(0)
             assert_(not fp.closed)

-    @np.testing.dec.skipif(IS_PYPY, "context manager required on PyPy")
+    @dec.skipif(IS_PYPY, "context manager required on PyPy")
     def test_closing_fid(self):
         # Test that issue #1517 (too many opened files) remains closed
         # It might be a "weak" test since failed to get triggered on
@@ -351,8 +343,8 @@ class TestSaveTxt(object):

     def test_0D_3D(self):
         c = BytesIO()
-        assert_raises(ValueError, np.savetxt, c, np.array(1))
-        assert_raises(ValueError, np.savetxt, c, np.array([[[1], [2]]]))
+        assert_raises(ValueError, np.savetxt, c, np.array(1))
+        assert_raises(ValueError, np.savetxt, c, np.array([[[1], [2]]]))

     def test_record(self):
@@ -530,7 +522,7 @@ class TestSaveTxt(object):
             assert_equal(s.read(), utf8 + '\n')

-class LoadTxtBase:
+class LoadTxtBase(object):
     def check_compressed(self, fopen, suffixes):
         # Test that we can load data from a compressed file
         wanted = np.arange(6).reshape((2, 3))
@@ -541,23 +533,22 @@
             with temppath(suffix=suffix) as name:
                 with fopen(name, mode='wt', encoding='UTF-32-LE') as f:
                     f.write(data)
-                res = getattr(np, self.loadfunc)(name,
-                                                 encoding='UTF-32-LE')
+                res = self.loadfunc(name, encoding='UTF-32-LE')
                 assert_array_equal(res, wanted)
-                res = getattr(np, self.loadfunc)(
-                    fopen(name, "rt", encoding='UTF-32-LE'))
+                with fopen(name, "rt", encoding='UTF-32-LE') as f:
+                    res = self.loadfunc(f)
                 assert_array_equal(res, wanted)

     # Python2 .open does not support encoding
-    @np.testing.dec.skipif(MAJVER == 2)
+    @dec.skipif(MAJVER == 2)
     def test_compressed_gzip(self):
         self.check_compressed(gzip.open, ('.gz',))

-    @np.testing.dec.skipif(MAJVER == 2 or not HAS_BZ2)
+    @dec.skipif(MAJVER == 2 or not HAS_BZ2)
     def test_compressed_bz2(self):
         self.check_compressed(bz2.open, ('.bz2',))

-    @np.testing.dec.skipif(MAJVER == 2 or not HAS_LZMA)
+    @dec.skipif(MAJVER == 2 or not HAS_LZMA)
     def test_compressed_lzma(self):
         self.check_compressed(lzma.open, ('.xz', '.lzma'))

     def test_encoding(self):
         with temppath() as path:
             with open(path, "wb") as f:
                 f.write('0.\n1.\n2.'.encode("UTF-16"))
-            x = getattr(np, self.loadfunc)(path, encoding="UTF-16")
+            x = self.loadfunc(path, encoding="UTF-16")
             assert_array_equal(x, [0., 1., 2.])

     def test_stringload(self):
         # umlaute
         nonascii = b'\xc3\xb6\xc3\xbc\xc3\xb6'.decode("UTF-8")
         with temppath() as path:
             with open(path, "wb") as f:
                 f.write(nonascii.encode("UTF-16"))
-            x = getattr(np, self.loadfunc)(path, encoding="UTF-16", dtype=np.unicode)
+            x = self.loadfunc(path, encoding="UTF-16", dtype=np.unicode)
             assert_array_equal(x, nonascii)

     def test_binary_decode(self):
         utf16 = b'\xff\xfeh\x04 \x00i\x04 \x00j\x04'
-        v = getattr(np, self.loadfunc)(BytesIO(utf16), dtype=np.unicode,
-                                       encoding='UTF-16')
+        v = self.loadfunc(BytesIO(utf16), dtype=np.unicode, encoding='UTF-16')
         assert_array_equal(v, np.array(utf16.decode('UTF-16').split()))

     def test_converters_decode(self):
         # test converters that decode strings
         c = TextIO()
         c.write(b'\xcf\x96')
         c.seek(0)
-        x = getattr(np, self.loadfunc)(c, dtype=np.unicode,
-                                       converters={0: lambda x: x.decode('UTF-8')})
+        x = self.loadfunc(c, dtype=np.unicode,
+                          converters={0: lambda x: x.decode('UTF-8')})
         a = np.array([b'\xcf\x96'.decode('UTF-8')])
         assert_array_equal(x, a)
@@ -599,15 +589,16 @@
         with temppath() as path:
             with io.open(path, 'wt', encoding='UTF-8') as f:
                 f.write(utf8)
-            x = getattr(np, self.loadfunc)(path, dtype=np.unicode,
-                                           converters={0: lambda x: x + 't'},
-                                           encoding='UTF-8')
+            x = self.loadfunc(path, dtype=np.unicode,
+                              converters={0: lambda x: x + 't'},
+                              encoding='UTF-8')
             a = np.array([utf8 + 't'])
             assert_array_equal(x, a)

 class TestLoadTxt(LoadTxtBase):
-    loadfunc = 'loadtxt'
+    loadfunc = staticmethod(np.loadtxt)
+
     def setUp(self):
         # lower chunksize for testing
         self.orig_chunk = np.lib.npyio._loadtxt_chunksize
@@ -1016,7 +1007,7 @@ class TestLoadTxt(LoadTxtBase):
         dt = np.dtype([('x', int), ('a', 'S10'), ('y', int)])
         np.loadtxt(c, delimiter=',', dtype=dt, comments=None)  # Should succeed

-    @np.testing.dec.skipif(locale.getpreferredencoding() == 'ANSI_X3.4-1968')
+    @dec.skipif(locale.getpreferredencoding() == 'ANSI_X3.4-1968')
     def test_binary_load(self):
         butf8 = b"5,6,7,\xc3\x95scarscar\n\r15,2,3,hello\n\r"\
                 b"20,2,3,\xc3\x95scar\n\r"
@@ -1087,7 +1078,8 @@ class Testfromregex(object):

 class TestFromTxt(LoadTxtBase):
-    loadfunc = 'genfromtxt'
+    loadfunc = staticmethod(np.genfromtxt)
+
     def test_record(self):
         # Test w/ explicit dtype
         data = TextIO('1 2\n3 4')
@@ -1933,8 +1925,7 @@ M 33 21.99

     def test_binary_decode_autodtype(self):
         utf16 = b'\xff\xfeh\x04 \x00i\x04 \x00j\x04'
-        v = getattr(np, self.loadfunc)(BytesIO(utf16), dtype=None,
-                                       encoding='UTF-16')
+        v = self.loadfunc(BytesIO(utf16), dtype=None, encoding='UTF-16')
         assert_array_equal(v, np.array(utf16.decode('UTF-16').split()))

     def test_utf8_byte_encoding(self):
@@ -1975,28 +1966,40 @@ M 33 21.99
             assert_equal(test['f0'], 0)
             assert_equal(test['f1'], "testNonethe" + utf8.decode("UTF-8"))

-    @np.testing.dec.skipif(can_encode(b"\xcf\x96".decode('UTF-8')))
+
     def test_utf8_file_nodtype_unicode(self):
         # bytes encoding with non-latin1 -> unicode upcast
-        utf8 = b"\xcf\x96"
-        latin1 = b"\xf6\xfc\xf6"
+        utf8 = u'\u03d6'
+        latin1 = u'\xf6\xfc\xf6'
+
+        # skip test if cannot encode utf8 test string with preferred
+        # encoding. The preferred encoding is assumed to be the default
+        # encoding of io.open. Will need to change this for PyTest, maybe
+        # using pytest.mark.xfail(raises=***).
+        try:
+            import locale
+            encoding = locale.getpreferredencoding()
+            utf8.encode(encoding)
+        except (UnicodeError, ImportError):
+            raise SkipTest('Skipping test_utf8_file_nodtype_unicode, '
+                           'unable to encode utf8 in preferred encoding')
+
         with temppath() as path:
-            with io.open(path, "wt",
-                         encoding=locale.getpreferredencoding()) as f:
+            with io.open(path, "wt") as f:
                 f.write(u"norm1,norm2,norm3\n")
-                f.write(u"norm1," + latin1.decode("latin1") + u",norm3\n")
-                f.write(u"test1,testNonethe" + utf8.decode("UTF-8") +
-                        u",test3\n")
+                f.write(u"norm1," + latin1 + u",norm3\n")
+                f.write(u"test1,testNonethe" + utf8 + u",test3\n")
+
             with warnings.catch_warnings(record=True) as w:
                 warnings.filterwarnings('always', '',
                                         np.VisibleDeprecationWarning)
                 test = np.genfromtxt(path, dtype=None, comments=None,
                                      delimiter=',')
+                # Check for warning when encoding not specified.
                 assert_(w[0].category is np.VisibleDeprecationWarning)
             ctl = np.array([
                      ["norm1", "norm2", "norm3"],
-                     ["norm1", latin1.decode("latin1"), "norm3"],
-                     ["test1", "testNonethe" + utf8.decode("UTF-8"), "test3"]],
+                     ["norm1", latin1, "norm3"],
+                     ["test1", "testNonethe" + utf8, "test3"]],
                      dtype=np.unicode)
             assert_array_equal(test, ctl)
@@ -2174,7 +2177,7 @@ M 33 21.99

 class TestPathUsage(object):
     # Test that pathlib.Path can be used
-    @np.testing.dec.skipif(Path is None, "No pathlib.Path")
+    @dec.skipif(Path is None, "No pathlib.Path")
     def test_loadtxt(self):
         with temppath(suffix='.txt') as path:
             path = Path(path)
             a = np.array([[1.1, 2], [3, 4]])
             np.savetxt(path, a)
             x = np.loadtxt(path)
             assert_array_equal(x, a)

-    @np.testing.dec.skipif(Path is None, "No pathlib.Path")
+    @dec.skipif(Path is None, "No pathlib.Path")
     def test_save_load(self):
         # Test that pathlib.Path instances can be used with save.
         with temppath(suffix='.npy') as path:
             path = Path(path)
             a = np.array([[1, 2], [3, 4]], int)
             np.save(path, a)
             data = np.load(path)
             assert_array_equal(data, a)

-    @np.testing.dec.skipif(Path is None, "No pathlib.Path")
+    @dec.skipif(Path is None, "No pathlib.Path")
     def test_savez_load(self):
         # Test that pathlib.Path instances can be used with savez.
         with temppath(suffix='.npz') as path:
             path = Path(path)
             np.savez(path, lab='place holder')
             with np.load(path) as data:
                 assert_array_equal(data['lab'], 'place holder')

-    @np.testing.dec.skipif(Path is None, "No pathlib.Path")
+    @dec.skipif(Path is None, "No pathlib.Path")
     def test_savez_compressed_load(self):
         # Test that pathlib.Path instances can be used with savez_compressed.
         with temppath(suffix='.npz') as path:
@@ -2212,7 +2215,7 @@ class TestPathUsage(object):
             np.savez_compressed(path, lab='place holder')
             data = np.load(path)
             assert_array_equal(data['lab'], 'place holder')
             data.close()

-    @np.testing.dec.skipif(Path is None, "No pathlib.Path")
+    @dec.skipif(Path is None, "No pathlib.Path")
     def test_genfromtxt(self):
         with temppath(suffix='.txt') as path:
             path = Path(path)
             a = np.array([(1, 2), (3, 4)])
             np.savetxt(path, a)
             data = np.genfromtxt(path)
             assert_array_equal(a, data)

-    @np.testing.dec.skipif(Path is None, "No pathlib.Path")
+    @dec.skipif(Path is None, "No pathlib.Path")
     def test_ndfromtxt(self):
         # Test outputting a standard ndarray
         with temppath(suffix='.txt') as path:
             path = Path(path)
             with path.open('w') as f:
                 f.write(u'1 2\n3 4')

             control = np.array([[1, 2], [3, 4]], dtype=int)
             test = np.ndfromtxt(path, dtype=int)
             assert_array_equal(test, control)

-    @np.testing.dec.skipif(Path is None, "No pathlib.Path")
+    @dec.skipif(Path is None, "No pathlib.Path")
     def test_mafromtxt(self):
         # From `test_fancy_dtype_alt` above
         with temppath(suffix='.txt') as path:
             path = Path(path)
             with path.open('w') as f:
                 f.write(u'1,2,3.0\n4,5,6.0\n')

             test = np.mafromtxt(path, delimiter=',')
             control = ma.array([(1.0, 2.0, 3.0), (4.0, 5.0, 6.0)])
             assert_equal(test, control)

-    @np.testing.dec.skipif(Path is None, "No pathlib.Path")
+    @dec.skipif(Path is None, "No pathlib.Path")
     def test_recfromtxt(self):
         with temppath(suffix='.txt') as path:
             path = Path(path)
             with path.open('w') as f:
                 f.write(u'A,B\n0,1\n2,3')

             kwargs = dict(delimiter=",", missing_values="N/A", names=True)
             test = np.recfromtxt(path, **kwargs)
             control = np.array([(0, 1), (2, 3)],
                                dtype=[('A', np.int), ('B', np.int)])
             assert_(isinstance(test, np.recarray))
             assert_equal(test, control)

-    @np.testing.dec.skipif(Path is None, "No pathlib.Path")
+    @dec.skipif(Path is None, "No pathlib.Path")
     def test_recfromcsv(self):
         with temppath(suffix='.txt') as path:
             path = Path(path)
--
cgit v1.2.1
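
With the lzma opener registered in _datasource, the text loaders accept .xz
and .lzma paths directly. A minimal sketch, assuming Python 3 with the lzma
module available and a hypothetical file name::

    import lzma

    import numpy as np

    # Write a small xz-compressed text file ...
    with lzma.open('data.txt.xz', 'wt', encoding='UTF-8') as f:
        f.write(u'0 1 2\n3 4 5\n')

    # ... and let the .xz suffix select the opener via np.lib._datasource.
    arr = np.genfromtxt('data.txt.xz', encoding='UTF-8')
    print(arr.shape)  # (2, 3)
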