author    Charles Harris <charlesr.harris@gmail.com>  2017-11-26 10:30:17 -0700
committer GitHub <noreply@github.com>                 2017-11-26 10:30:17 -0700
commit    8c441fae4bf1b101444d06fc5dcfdd29e66a9ba4 (patch)
tree      e8ea159c9d9d3eed3bedc0ea702732119dc0b5c4 /numpy
parent    b13c70551f1ebc5222e9b11da61e2b886fbbef5e (diff)
parent    1d97b3aafdca2722bbe2f0c10a96544121c8f78b (diff)
Merge pull request #10054 from charris/gh-4208
ENH: Add encoding option to numpy text IO.
Diffstat (limited to 'numpy')
-rw-r--r--  numpy/lib/_datasource.py          149
-rw-r--r--  numpy/lib/_iotools.py             106
-rw-r--r--  numpy/lib/npyio.py                392
-rw-r--r--  numpy/lib/tests/test__iotools.py  131
-rw-r--r--  numpy/lib/tests/test_io.py        425
5 files changed, 919 insertions, 284 deletions
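The new `encoding` keyword threads through loadtxt, savetxt, genfromtxt and
fromregex. A minimal sketch of the new user-facing surface, assuming
NumPy >= 1.14 (the release this PR targets) and an illustrative file name:

    import numpy as np

    # Write a small table as UTF-8 text; `encoding` is new in this PR.
    np.savetxt('data.csv', np.arange(6).reshape(2, 3), fmt='%d',
               delimiter=',', encoding='UTF-8')

    # An explicit encoding (or None for the system default) switches
    # loadtxt off the legacy 'bytes' mode, so string data comes back as
    # unicode and converters receive str instead of latin1 bytes.
    a = np.loadtxt('data.csv', delimiter=',', encoding='UTF-8')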
diff --git a/numpy/lib/_datasource.py b/numpy/lib/_datasource.py
index 3affc5195..6f1295f09 100644
--- a/numpy/lib/_datasource.py
+++ b/numpy/lib/_datasource.py
@@ -15,7 +15,7 @@ DataSource files can originate locally or remotely:
- URLs (http, ftp, ...) : 'http://www.scipy.org/not/real/data.txt'
DataSource files can also be compressed or uncompressed. Currently only
-gzip and bz2 are supported.
+gzip, bz2 and xz are supported.
Example::
@@ -38,13 +38,99 @@ from __future__ import division, absolute_import, print_function
import os
import sys
import shutil
+import io
_open = open
+def _check_mode(mode, encoding, newline):
+ """Check mode and that encoding and newline are compatible.
+
+ Parameters
+ ----------
+ mode : str
+ File open mode.
+ encoding : str
+ File encoding.
+ newline : str
+ Newline for text files.
+
+ """
+ if "t" in mode:
+ if "b" in mode:
+ raise ValueError("Invalid mode: %r" % (mode,))
+ else:
+ if encoding is not None:
+ raise ValueError("Argument 'encoding' not supported in binary mode")
+ if newline is not None:
+ raise ValueError("Argument 'newline' not supported in binary mode")
+
+
+def _python2_bz2open(fn, mode, encoding, newline):
+ """Wrapper to open bz2 in text mode.
+
+ Parameters
+ ----------
+ fn : str
+ File name
+ mode : {'r', 'w'}
+ File mode. Note that bz2 text files are not supported.
+ encoding : str
+ Ignored, text bz2 files not supported in Python2.
+ newline : str
+ Ignored, text bz2 files not supported in Python2.
+ """
+ import bz2
+
+ _check_mode(mode, encoding, newline)
+
+ if "t" in mode:
+ # BZ2File is missing necessary functions for TextIOWrapper
+ raise ValueError("bz2 text files not supported in python2")
+ else:
+ return bz2.BZ2File(fn, mode)
+
+def _python2_gzipopen(fn, mode, encoding, newline):
+ """ Wrapper to open gzip in text mode.
+
+ Parameters
+ ----------
+ fn : str, bytes, file
+ File path or opened file.
+ mode : str
+ File mode. The actual files are opened as binary, but will be decoded
+ using the specified `encoding` and `newline`.
+ encoding : str
+ Encoding to be used when reading/writing as text.
+ newline : str
+ Newline to be used when reading/writing as text.
+
+ """
+ import gzip
+ # gzip is lacking read1 needed for TextIOWrapper
+ class GzipWrap(gzip.GzipFile):
+ def read1(self, n):
+ return self.read(n)
+
+ _check_mode(mode, encoding, newline)
+
+ gz_mode = mode.replace("t", "")
+
+ if isinstance(fn, (str, bytes)):
+ binary_file = GzipWrap(fn, gz_mode)
+ elif hasattr(fn, "read") or hasattr(fn, "write"):
+ binary_file = GzipWrap(None, gz_mode, fileobj=fn)
+ else:
+ raise TypeError("filename must be a str or bytes object, or a file")
+
+ if "t" in mode:
+ return io.TextIOWrapper(binary_file, encoding, newline=newline)
+ else:
+ return binary_file
+
# Using a class instead of a module-level dictionary
# to reduce the initial 'import numpy' overhead by
-# deferring the import of bz2 and gzip until needed
+# deferring the import of lzma, bz2 and gzip until needed
# TODO: .zip support, .tar support?
class _FileOpeners(object):
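The GzipWrap subclass above exists because io.TextIOWrapper needs a
read1() method that gzip.GzipFile lacked on Python 2. The same wrapping
pattern, sketched outside numpy (names here are illustrative):

    import gzip
    import io

    class Read1Gzip(gzip.GzipFile):
        # TextIOWrapper calls read1(); delegating to read() is enough.
        def read1(self, n):
            return self.read(n)

    def open_gzip_text(path, encoding):
        # Open the compressed file in binary mode, then layer text
        # decoding on top, as _python2_gzipopen does for 't' modes.
        return io.TextIOWrapper(Read1Gzip(path, 'r'), encoding)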
@@ -55,7 +141,7 @@ class _FileOpeners(object):
supported file format. Attribute lookup is implemented in such a way
that an instance of `_FileOpeners` itself can be indexed with the keys
of that dictionary. Currently uncompressed files as well as files
- compressed with ``gzip`` or ``bz2`` compression are supported.
+ compressed with ``gzip``, ``bz2`` or ``xz`` compression are supported.
Notes
-----
@@ -65,7 +151,7 @@ class _FileOpeners(object):
Examples
--------
>>> np.lib._datasource._file_openers.keys()
- [None, '.bz2', '.gz']
+ [None, '.bz2', '.gz', '.xz', '.lzma']
>>> np.lib._datasource._file_openers['.gz'] is gzip.open
True
@@ -73,21 +159,39 @@ class _FileOpeners(object):
def __init__(self):
self._loaded = False
- self._file_openers = {None: open}
+ self._file_openers = {None: io.open}
def _load(self):
if self._loaded:
return
+
try:
import bz2
- self._file_openers[".bz2"] = bz2.BZ2File
+ if sys.version_info[0] >= 3:
+ self._file_openers[".bz2"] = bz2.open
+ else:
+ self._file_openers[".bz2"] = _python2_bz2open
except ImportError:
pass
+
try:
import gzip
- self._file_openers[".gz"] = gzip.open
+ if sys.version_info[0] >= 3:
+ self._file_openers[".gz"] = gzip.open
+ else:
+ self._file_openers[".gz"] = _python2_gzipopen
except ImportError:
pass
+
+ try:
+ import lzma
+ self._file_openers[".xz"] = lzma.open
+ self._file_openers[".lzma"] = lzma.open
+ except (ImportError, AttributeError):
+ # There are incompatible backports of lzma that do not have the
+ # lzma.open attribute, so catch that as well as ImportError.
+ pass
+
self._loaded = True
def keys(self):
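The lzma registration is deliberately defensive: some third-party lzma
backports lack lzma.open and raise AttributeError rather than
ImportError. A standalone sketch of that guarded-import pattern:

    # Prefer lzma.open when a compatible module is available; fall back
    # to no xz support otherwise, exactly as _load() does above.
    try:
        import lzma
        xz_open = lzma.open
    except (ImportError, AttributeError):
        xz_open = None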
@@ -102,7 +206,7 @@ class _FileOpeners(object):
-------
keys : list
The keys are None for uncompressed files and the file extension
- strings (i.e. ``'.gz'``, ``'.bz2'``) for supported compression
+ strings (e.g. ``'.gz'``, ``'.xz'``) for supported compression
methods.
"""
@@ -115,7 +219,7 @@ class _FileOpeners(object):
_file_openers = _FileOpeners()
-def open(path, mode='r', destpath=os.curdir):
+def open(path, mode='r', destpath=os.curdir, encoding=None, newline=None):
"""
Open `path` with `mode` and return the file object.
@@ -134,6 +238,11 @@ def open(path, mode='r', destpath=os.curdir):
Path to the directory where the source file gets downloaded to for
use. If `destpath` is None, a temporary directory will be created.
The default path is the current directory.
+ encoding : {None, str}, optional
+ Open text file with given encoding. The default encoding will be
+ what `io.open` uses.
+ newline : {None, str}, optional
+ Newline to use when reading text file.
Returns
-------
@@ -148,7 +257,7 @@ def open(path, mode='r', destpath=os.curdir):
"""
ds = DataSource(destpath)
- return ds.open(path, mode)
+ return ds.open(path, mode, encoding=encoding, newline=newline)
class DataSource (object):
@@ -458,7 +567,7 @@ class DataSource (object):
return False
return False
- def open(self, path, mode='r'):
+ def open(self, path, mode='r', encoding=None, newline=None):
"""
Open and return file-like object.
@@ -473,6 +582,11 @@ class DataSource (object):
Mode to open `path`. Mode 'r' for reading, 'w' for writing,
'a' to append. Available modes depend on the type of object
specified by `path`. Default is 'r'.
+ encoding : {None, str}, optional
+ Open text file with given encoding. The default encoding will be
+ what `io.open` uses.
+ newline : {None, str}, optional
+ Newline to use when reading text file.
Returns
-------
@@ -496,7 +610,8 @@ class DataSource (object):
_fname, ext = self._splitzipext(found)
if ext == 'bz2':
mode.replace("+", "")
- return _file_openers[ext](found, mode=mode)
+ return _file_openers[ext](found, mode=mode,
+ encoding=encoding, newline=newline)
else:
raise IOError("%s not found." % path)
@@ -619,7 +734,7 @@ class Repository (DataSource):
"""
return DataSource.exists(self, self._fullpath(path))
- def open(self, path, mode='r'):
+ def open(self, path, mode='r', encoding=None, newline=None):
"""
Open and return file-like object prepending Repository base URL.
@@ -636,6 +751,11 @@ class Repository (DataSource):
Mode to open `path`. Mode 'r' for reading, 'w' for writing,
'a' to append. Available modes depend on the type of object
specified by `path`. Default is 'r'.
+ encoding : {None, str}, optional
+ Open text file with given encoding. The default encoding will be
+ what `io.open` uses.
+ newline : {None, str}, optional
+ Newline to use when reading text file.
Returns
-------
@@ -643,7 +763,8 @@ class Repository (DataSource):
File object.
"""
- return DataSource.open(self, self._fullpath(path), mode)
+ return DataSource.open(self, self._fullpath(path), mode,
+ encoding=encoding, newline=newline)
def listdir(self):
"""
diff --git a/numpy/lib/_iotools.py b/numpy/lib/_iotools.py
index 1874c2e97..27143e5c6 100644
--- a/numpy/lib/_iotools.py
+++ b/numpy/lib/_iotools.py
@@ -8,7 +8,7 @@ __docformat__ = "restructuredtext en"
import sys
import numpy as np
import numpy.core.numeric as nx
-from numpy.compat import asbytes, bytes, asbytes_nested, basestring
+from numpy.compat import asbytes, asunicode, bytes, asbytes_nested, basestring
if sys.version_info[0] >= 3:
from builtins import bool, int, float, complex, object, str
@@ -17,15 +17,30 @@ else:
from __builtin__ import bool, int, float, complex, object, unicode, str
-if sys.version_info[0] >= 3:
- def _bytes_to_complex(s):
- return complex(s.decode('ascii'))
+def _decode_line(line, encoding=None):
+ """Decode bytes from binary input streams.
- def _bytes_to_name(s):
- return s.decode('ascii')
-else:
- _bytes_to_complex = complex
- _bytes_to_name = str
+ Defaults to decoding from 'latin1'. That differs from the behavior of
+ np.compat.asunicode that decodes from 'ascii'.
+
+ Parameters
+ ----------
+ line : str or bytes
+ Line to be decoded.
+
+ Returns
+ -------
+ decoded_line : unicode
+ Unicode in Python 2, a str (unicode) in Python 3.
+
+ """
+ if type(line) is bytes:
+ if encoding is None:
+ line = line.decode('latin1')
+ else:
+ line = line.decode(encoding)
+
+ return line
def _is_string_like(obj):
@@ -189,12 +204,10 @@ class LineSplitter(object):
return lambda input: [_.strip() for _ in method(input)]
#
- def __init__(self, delimiter=None, comments=b'#', autostrip=True):
+ def __init__(self, delimiter=None, comments='#', autostrip=True, encoding=None):
self.comments = comments
# Delimiter is a character
- if isinstance(delimiter, unicode):
- delimiter = delimiter.encode('ascii')
- if (delimiter is None) or _is_bytes_like(delimiter):
+ if (delimiter is None) or isinstance(delimiter, basestring):
delimiter = delimiter or None
_handyman = self._delimited_splitter
# Delimiter is a list of field widths
@@ -213,12 +226,14 @@ class LineSplitter(object):
self._handyman = self.autostrip(_handyman)
else:
self._handyman = _handyman
+ self.encoding = encoding
#
def _delimited_splitter(self, line):
+ """Chop off comments, strip, and split at delimiter. """
if self.comments is not None:
line = line.split(self.comments)[0]
- line = line.strip(b" \r\n")
+ line = line.strip(" \r\n")
if not line:
return []
return line.split(self.delimiter)
@@ -227,7 +242,7 @@ class LineSplitter(object):
def _fixedwidth_splitter(self, line):
if self.comments is not None:
line = line.split(self.comments)[0]
- line = line.strip(b"\r\n")
+ line = line.strip("\r\n")
if not line:
return []
fixed = self.delimiter
@@ -245,7 +260,7 @@ class LineSplitter(object):
#
def __call__(self, line):
- return self._handyman(line)
+ return self._handyman(_decode_line(line, self.encoding))
class NameValidator(object):
@@ -434,9 +449,9 @@ def str2bool(value):
"""
value = value.upper()
- if value == b'TRUE':
+ if value == 'TRUE':
return True
- elif value == b'FALSE':
+ elif value == 'FALSE':
return False
else:
raise ValueError("Invalid boolean")
@@ -510,8 +525,10 @@ class StringConverter(object):
Value to return by default, that is, when the string to be
converted is flagged as missing. If not given, `StringConverter`
tries to supply a reasonable default value.
- missing_values : sequence of str, optional
- Sequence of strings indicating a missing value.
+ missing_values : {None, sequence of str}, optional
+ ``None`` or sequence of strings indicating a missing value. If ``None``
+ then missing values are indicated by empty entries. The default is
+ ``None``.
locked : bool, optional
Whether the StringConverter should be locked to prevent automatic
upgrade or not. Default is False.
@@ -527,9 +544,10 @@ class StringConverter(object):
_mapper.append((nx.int64, int, -1))
_mapper.extend([(nx.floating, float, nx.nan),
- (nx.complexfloating, _bytes_to_complex, nx.nan + 0j),
+ (nx.complexfloating, complex, nx.nan + 0j),
(nx.longdouble, nx.longdouble, nx.nan),
- (nx.string_, bytes, b'???')])
+ (nx.unicode_, asunicode, '???'),
+ (nx.string_, asbytes, '???')])
(_defaulttype, _defaultfunc, _defaultfill) = zip(*_mapper)
@@ -601,11 +619,6 @@ class StringConverter(object):
def __init__(self, dtype_or_func=None, default=None, missing_values=None,
locked=False):
- # Convert unicode (for Py3)
- if isinstance(missing_values, unicode):
- missing_values = asbytes(missing_values)
- elif isinstance(missing_values, (list, tuple)):
- missing_values = asbytes_nested(missing_values)
# Defines a lock for upgrade
self._locked = bool(locked)
# No input dtype: minimal initialization
@@ -631,7 +644,7 @@ class StringConverter(object):
# None
if default is None:
try:
- default = self.func(b'0')
+ default = self.func('0')
except ValueError:
default = None
dtype = self._getdtype(default)
@@ -676,11 +689,11 @@ class StringConverter(object):
self.func = lambda x: int(float(x))
# Store the list of strings corresponding to missing values.
if missing_values is None:
- self.missing_values = set([b''])
+ self.missing_values = set([''])
else:
- if isinstance(missing_values, bytes):
- missing_values = missing_values.split(b",")
- self.missing_values = set(list(missing_values) + [b''])
+ if isinstance(missing_values, basestring):
+ missing_values = missing_values.split(",")
+ self.missing_values = set(list(missing_values) + [''])
#
self._callingfunction = self._strict_call
self.type = self._dtypeortype(dtype)
@@ -801,7 +814,7 @@ class StringConverter(object):
self.iterupgrade(value)
def update(self, func, default=None, testing_value=None,
- missing_values=b'', locked=False):
+ missing_values='', locked=False):
"""
Set StringConverter attributes directly.
@@ -817,8 +830,9 @@ class StringConverter(object):
A string representing a standard input value of the converter.
This string is used to help defining a reasonable default
value.
- missing_values : sequence of str, optional
- Sequence of strings indicating a missing value.
+ missing_values : {sequence of str, None}, optional
+ Sequence of strings indicating a missing value. If ``None``, then
+ the existing `missing_values` are cleared. The default is `''`.
locked : bool, optional
Whether the StringConverter should be locked to prevent
automatic upgrade or not. Default is False.
@@ -832,25 +846,29 @@ class StringConverter(object):
"""
self.func = func
self._locked = locked
+
# Don't reset the default to None if we can avoid it
if default is not None:
self.default = default
self.type = self._dtypeortype(self._getdtype(default))
else:
try:
- tester = func(testing_value or b'1')
+ tester = func(testing_value or '1')
except (TypeError, ValueError):
tester = None
self.type = self._dtypeortype(self._getdtype(tester))
- # Add the missing values to the existing set
- if missing_values is not None:
- if _is_bytes_like(missing_values):
- self.missing_values.add(missing_values)
- elif hasattr(missing_values, '__iter__'):
- for val in missing_values:
- self.missing_values.add(val)
+
+ # Add the missing values to the existing set or clear it.
+ if missing_values is None:
+ # Clear all missing values even though the ctor initializes it to
+ # set(['']) when the argument is None.
+ self.missing_values = set()
else:
- self.missing_values = []
+ if not np.iterable(missing_values):
+ missing_values = [missing_values]
+ if not all(isinstance(v, basestring) for v in missing_values):
+ raise TypeError("missing_values must be strings or unicode")
+ self.missing_values.update(missing_values)
def easy_dtype(ndtype, names=None, defaultfmt="f%i", **validationargs):
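After this change the _iotools helpers work on native strings, with
_decode_line turning bytes into str (latin1 unless told otherwise). A
small sketch of the new behavior, assuming this private API stays as
shown above:

    from numpy.lib._iotools import LineSplitter, StringConverter

    # Bytes input is decoded, comments chopped off, fields stripped.
    split = LineSplitter(delimiter=',', comments='#')
    split(b'1,2,3  # trailing comment')   # -> ['1', '2', '3']

    # missing_values now accepts native strings.
    conv = StringConverter(int, default=0, missing_values='N/A')
    conv.missing_values                    # -> {'', 'N/A'}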
diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py
index 6de5940d7..e4d827334 100644
--- a/numpy/lib/npyio.py
+++ b/numpy/lib/npyio.py
@@ -1,5 +1,6 @@
from __future__ import division, absolute_import, print_function
+import io
import sys
import os
import re
@@ -15,11 +16,12 @@ from numpy.core.multiarray import packbits, unpackbits
from ._iotools import (
LineSplitter, NameValidator, StringConverter, ConverterError,
ConverterLockError, ConversionWarning, _is_string_like,
- has_nested_fields, flatten_dtype, easy_dtype, _bytes_to_name
+ has_nested_fields, flatten_dtype, easy_dtype, _decode_line
)
from numpy.compat import (
- asbytes, asstr, asbytes_nested, bytes, basestring, unicode, is_pathlib_path
+ asbytes, asstr, asunicode, asbytes_nested, bytes, basestring, unicode,
+ is_pathlib_path
)
if sys.version_info[0] >= 3:
@@ -294,7 +296,7 @@ def load(file, mmap_mode=None, allow_pickle=True, fix_imports=True,
used in Python 3.
encoding : str, optional
What encoding to use when reading Python 2 strings. Only useful when
- loading Python 2 generated pickled files on Python 3, which includes
+ loading Python 2 generated pickled files in Python 3, which includes
npy/npz files containing object arrays. Values other than 'latin1',
'ASCII', and 'bytes' are not allowed, as they can corrupt numerical
data. Default: 'ASCII'
@@ -731,8 +733,8 @@ def _getconv(dtype):
def floatconv(x):
x.lower()
- if b'0x' in x:
- return float.fromhex(asstr(x))
+ if '0x' in x:
+ return float.fromhex(x)
return float(x)
typ = dtype.type
@@ -752,13 +754,17 @@ def _getconv(dtype):
return lambda x: complex(asstr(x))
elif issubclass(typ, np.bytes_):
return asbytes
+ elif issubclass(typ, np.unicode_):
+ return asunicode
else:
return asstr
+# number of lines loadtxt reads in one chunk, can be overridden for testing
+_loadtxt_chunksize = 50000
def loadtxt(fname, dtype=float, comments='#', delimiter=None,
converters=None, skiprows=0, usecols=None, unpack=False,
- ndmin=0):
+ ndmin=0, encoding='bytes'):
"""
Load data from a text file.
@@ -776,13 +782,13 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
each row will be interpreted as an element of the array. In this
case, the number of columns used must match the number of fields in
the data-type.
- comments : str or sequence, optional
+ comments : str or sequence of str, optional
The characters or list of characters used to indicate the start of a
- comment;
- default: '#'.
+ comment. For backwards compatibility, byte strings will be decoded as
+ 'latin1'. The default is '#'.
delimiter : str, optional
- The string used to separate values. By default, this is any
- whitespace.
+ The string used to separate values. For backwards compatibility, byte
+ strings will be decoded as 'latin1'. The default is whitespace.
converters : dict, optional
A dictionary mapping column number to a function that will convert
that column to a float. E.g., if column 0 is a date string:
@@ -791,18 +797,15 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
``converters = {3: lambda s: float(s.strip() or 0)}``. Default: None.
skiprows : int, optional
Skip the first `skiprows` lines; default: 0.
-
usecols : int or sequence, optional
Which columns to read, with 0 being the first. For example,
usecols = (1,4,5) will extract the 2nd, 5th and 6th columns.
The default, None, results in all columns being read.
- .. versionadded:: 1.11.0
-
- Also when a single column has to be read it is possible to use
- an integer instead of a tuple. E.g ``usecols = 3`` reads the
- fourth column the same way as `usecols = (3,)`` would.
-
+ .. versionchanged:: 1.11.0
+ When a single column has to be read it is possible to use
+ an integer instead of a tuple. E.g. ``usecols = 3`` reads the
+ fourth column the same way as ``usecols = (3,)`` would.
unpack : bool, optional
If True, the returned array is transposed, so that arguments may be
unpacked using ``x, y, z = loadtxt(...)``. When used with a structured
@@ -813,6 +816,15 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
Legal values: 0 (default), 1 or 2.
.. versionadded:: 1.6.0
+ encoding : str, optional
+ Encoding used to decode the input file. Does not apply to input streams.
+ The special value 'bytes' enables backward compatibility workarounds
+ that ensure you receive byte arrays as results if possible and pass
+ latin1 encoded strings to converters. Override this value to receive
+ unicode arrays and pass strings as input to converters. If set to None
+ the system default is used. The default value is 'bytes'.
+
+ .. versionadded:: 1.14.0
Returns
-------
@@ -861,16 +873,22 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
# Type conversions for Py3 convenience
if comments is not None:
if isinstance(comments, (basestring, bytes)):
- comments = [asbytes(comments)]
- else:
- comments = [asbytes(comment) for comment in comments]
-
+ comments = [comments]
+ comments = [_decode_line(x) for x in comments]
# Compile regex for comments beforehand
comments = (re.escape(comment) for comment in comments)
- regex_comments = re.compile(b'|'.join(comments))
- user_converters = converters
+ regex_comments = re.compile('|'.join(comments))
+
if delimiter is not None:
- delimiter = asbytes(delimiter)
+ delimiter = _decode_line(delimiter)
+
+ user_converters = converters
+
+ if encoding == 'bytes':
+ encoding = None
+ byte_converters = True
+ else:
+ byte_converters = False
if usecols is not None:
# Allow usecols to be a single int or a sequence of ints
@@ -896,22 +914,24 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
if is_pathlib_path(fname):
fname = str(fname)
if _is_string_like(fname):
+ fh = np.lib._datasource.open(fname, 'rt', encoding=encoding)
+ fencoding = getattr(fh, 'encoding', 'latin1')
+ fh = iter(fh)
fown = True
- if fname.endswith('.gz'):
- import gzip
- fh = iter(gzip.GzipFile(fname))
- elif fname.endswith('.bz2'):
- import bz2
- fh = iter(bz2.BZ2File(fname))
- elif sys.version_info[0] == 2:
- fh = iter(open(fname, 'U'))
- else:
- fh = iter(open(fname))
else:
fh = iter(fname)
+ fencoding = getattr(fname, 'encoding', 'latin1')
except TypeError:
raise ValueError('fname must be a string, file handle, or generator')
- X = []
+
+ # input may be a python2 io stream
+ if encoding is not None:
+ fencoding = encoding
+ # we must assume local encoding
+ # TODO emit portability warning?
+ elif fencoding is None:
+ import locale
+ fencoding = locale.getpreferredencoding()
# not to be confused with the flatten_dtype we import...
def flatten_dtype_internal(dt):
@@ -960,21 +980,53 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
return tuple(ret)
def split_line(line):
- """Chop off comments, strip, and split at delimiter.
+ """Chop off comments, strip, and split at delimiter. """
+ line = _decode_line(line, encoding=encoding)
- Note that although the file is opened as text, this function
- returns bytes.
-
- """
- line = asbytes(line)
if comments is not None:
- line = regex_comments.split(asbytes(line), maxsplit=1)[0]
- line = line.strip(b'\r\n')
+ line = regex_comments.split(line, maxsplit=1)[0]
+ line = line.strip('\r\n')
if line:
return line.split(delimiter)
else:
return []
+ def read_data(chunk_size):
+ """Parse each line, including the first.
+
+ The file to read from, `fh`, is a global defined above.
+
+ Parameters
+ ----------
+ chunk_size : int
+ At most `chunk_size` lines are read at a time, with iteration
+ until all lines are read.
+
+ """
+ X = []
+ for i, line in enumerate(itertools.chain([first_line], fh)):
+ vals = split_line(line)
+ if len(vals) == 0:
+ continue
+ if usecols:
+ vals = [vals[j] for j in usecols]
+ if len(vals) != N:
+ line_num = i + skiprows + 1
+ raise ValueError("Wrong number of columns at line %d"
+ % line_num)
+
+ # Convert each value according to its column and store
+ items = [conv(val) for (conv, val) in zip(converters, vals)]
+
+ # Then pack it according to the dtype's nesting
+ items = pack_items(items, packing)
+ X.append(items)
+ if len(X) > chunk_size:
+ yield X
+ X = []
+ if X:
+ yield X
+
try:
# Make sure we're dealing with a proper dtype
dtype = np.dtype(dtype)
@@ -1017,30 +1069,42 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
except ValueError:
# Unused converter specified
continue
- converters[i] = conv
-
- # Parse each line, including the first
- for i, line in enumerate(itertools.chain([first_line], fh)):
- vals = split_line(line)
- if len(vals) == 0:
- continue
- if usecols:
- vals = [vals[j] for j in usecols]
- if len(vals) != N:
- line_num = i + skiprows + 1
- raise ValueError("Wrong number of columns at line %d"
- % line_num)
-
- # Convert each value according to its column and store
- items = [conv(val) for (conv, val) in zip(converters, vals)]
- # Then pack it according to the dtype's nesting
- items = pack_items(items, packing)
- X.append(items)
+ if byte_converters:
+ # converters may use decode to work around numpy's old behaviour,
+ # so encode the string again before passing to the user converter
+ def tobytes_first(x, conv):
+ if type(x) is bytes:
+ return conv(x)
+ return conv(x.encode("latin1"))
+ import functools
+ converters[i] = functools.partial(tobytes_first, conv=conv)
+ else:
+ converters[i] = conv
+
+ converters = [conv if conv is not bytes else
+ lambda x: x.encode(fencoding) for conv in converters]
+
+ # read data in chunks and fill it into an array via resize
+ # over-allocating and shrinking the array later may be faster but is
+ # probably not relevant compared to the cost of actually reading and
+ # converting the data
+ X = None
+ for x in read_data(_loadtxt_chunksize):
+ if X is None:
+ X = np.array(x, dtype)
+ else:
+ nshape = list(X.shape)
+ pos = nshape[0]
+ nshape[0] += len(x)
+ X.resize(nshape)
+ X[pos:, ...] = x
finally:
if fown:
fh.close()
- X = np.array(X, dtype)
+ if X is None:
+ X = np.array([], dtype)
+
# Multicolumn data are returned with shape (1, N, M), i.e.
# (1, 1, M) for a single row - remove the singleton dimension there
if X.ndim == 3 and X.shape[:2] == (1, 1):
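The chunked-read pattern above (consume read_data in _loadtxt_chunksize
slices, growing the result with ndarray.resize) can be shown in
isolation; the helper name below is illustrative, not numpy API:

    import numpy as np

    def fill_chunked(chunks, dtype):
        # `chunks` yields lists of parsed rows, like read_data above.
        out = None
        for rows in chunks:
            if out is None:
                out = np.array(rows, dtype)
            else:
                pos = out.shape[0]
                out.resize((pos + len(rows),) + out.shape[1:])
                out[pos:, ...] = rows
        return np.array([], dtype) if out is None else out

    data = fill_chunked(iter([[[0, 1, 2]], [[3, 4, 5]]]), float)
    # data.shape == (2, 3)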
@@ -1072,7 +1136,7 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='',
- footer='', comments='# '):
+ footer='', comments='# ', encoding=None):
"""
Save an array to a text file.
@@ -1116,6 +1180,13 @@ def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='',
``numpy.loadtxt``.
.. versionadded:: 1.7.0
+ encoding : {None, str}, optional
+ Encoding used to encode the output file. Does not apply to output
+ streams. If the encoding is something other than 'bytes' or 'latin1'
+ you will not be able to load the file in NumPy versions < 1.14. Default
+ is 'latin1'.
+
+ .. versionadded:: 1.14.0
See Also
@@ -1190,21 +1261,53 @@ def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='',
fmt = asstr(fmt)
delimiter = asstr(delimiter)
+ class WriteWrap(object):
+ """Convert to unicode in py2 or to bytes on bytestream inputs.
+
+ """
+ def __init__(self, fh, encoding):
+ self.fh = fh
+ self.encoding = encoding
+ self.do_write = self.first_write
+
+ def close(self):
+ self.fh.close()
+
+ def write(self, v):
+ self.do_write(v)
+
+ def write_bytes(self, v):
+ if isinstance(v, bytes):
+ self.fh.write(v)
+ else:
+ self.fh.write(v.encode(self.encoding))
+
+ def write_normal(self, v):
+ self.fh.write(asunicode(v))
+
+ def first_write(self, v):
+ try:
+ self.write_normal(v)
+ self.write = self.write_normal
+ except TypeError:
+ # input is probably a bytestream
+ self.write_bytes(v)
+ self.write = self.write_bytes
+
own_fh = False
if is_pathlib_path(fname):
fname = str(fname)
if _is_string_like(fname):
+ # datasource doesn't support creating a new file ...
+ open(fname, 'wt').close()
+ fh = np.lib._datasource.open(fname, 'wt', encoding=encoding)
own_fh = True
- if fname.endswith('.gz'):
- import gzip
- fh = gzip.open(fname, 'wb')
- else:
- if sys.version_info[0] >= 3:
- fh = open(fname, 'wb')
- else:
- fh = open(fname, 'w')
+ # need to convert str to unicode for text io output
+ if sys.version_info[0] == 2:
+ fh = WriteWrap(fh, encoding or 'latin1')
elif hasattr(fname, 'write'):
- fh = fname
+ # wrap to handle byte output streams
+ fh = WriteWrap(fname, encoding or 'latin1')
else:
raise ValueError('fname must be a string or file handle')
@@ -1254,31 +1357,33 @@ def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='',
if len(header) > 0:
header = header.replace('\n', '\n' + comments)
- fh.write(asbytes(comments + header + newline))
+ fh.write(comments + header + newline)
if iscomplex_X:
for row in X:
row2 = []
for number in row:
row2.append(number.real)
row2.append(number.imag)
- fh.write(asbytes(format % tuple(row2) + newline))
+ fh.write(format % tuple(row2) + newline)
else:
for row in X:
try:
- fh.write(asbytes(format % tuple(row) + newline))
+ v = format % tuple(row) + newline
except TypeError:
raise TypeError("Mismatch between array dtype ('%s') and "
"format specifier ('%s')"
% (str(X.dtype), format))
+ fh.write(v)
+
if len(footer) > 0:
footer = footer.replace('\n', '\n' + comments)
- fh.write(asbytes(comments + footer + newline))
+ fh.write(comments + footer + newline)
finally:
if own_fh:
fh.close()
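The WriteWrap shim makes savetxt agnostic to whether the target stream
wants str or bytes: the first write tries text and falls back to
encoding per row. A sketch mirroring the new tests:

    import io
    import numpy as np

    a = np.array([u'\u03d6'])       # a character outside latin1

    s = io.StringIO()               # text stream: rows written as str
    np.savetxt(s, a, fmt='%s', encoding='UTF-8')

    b = io.BytesIO()                # byte stream: rows encoded by WriteWrap
    np.savetxt(b, a, fmt='%s', encoding='UTF-8')
    assert b.getvalue().decode('UTF-8') == s.getvalue()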
-def fromregex(file, regexp, dtype):
+def fromregex(file, regexp, dtype, encoding=None):
"""
Construct an array from a text file, using regular expression parsing.
@@ -1295,6 +1400,10 @@ def fromregex(file, regexp, dtype):
Groups in the regular expression correspond to fields in the dtype.
dtype : dtype or list of dtypes
Dtype for the structured array.
+ encoding : str, optional
+ Encoding used to decode the input file. Does not apply to input streams.
+
+ .. versionadded:: 1.14.0
Returns
-------
@@ -1335,16 +1444,22 @@ def fromregex(file, regexp, dtype):
"""
own_fh = False
if not hasattr(file, "read"):
- file = open(file, 'rb')
+ file = np.lib._datasource.open(file, 'rt', encoding=encoding)
own_fh = True
try:
- if not hasattr(regexp, 'match'):
- regexp = re.compile(asbytes(regexp))
if not isinstance(dtype, np.dtype):
dtype = np.dtype(dtype)
- seq = regexp.findall(file.read())
+ content = file.read()
+ if isinstance(content, bytes) and not isinstance(regexp, bytes):
+ regexp = asbytes(regexp)
+ elif not isinstance(content, bytes) and isinstance(regexp, bytes):
+ regexp = asstr(regexp)
+
+ if not hasattr(regexp, 'match'):
+ regexp = re.compile(regexp)
+ seq = regexp.findall(content)
if seq and not isinstance(seq[0], tuple):
# Only one group is in the regexp.
# Create the new array as a single data-type and then
@@ -1372,7 +1487,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
names=None, excludelist=None, deletechars=None,
replace_space='_', autostrip=False, case_sensitive=True,
defaultfmt="f%i", unpack=None, usemask=False, loose=True,
- invalid_raise=True, max_rows=None):
+ invalid_raise=True, max_rows=None, encoding='bytes'):
"""
Load data from a text file, with missing values handled as specified.
@@ -1460,6 +1575,15 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
to read the entire file.
.. versionadded:: 1.10.0
+ encoding : str, optional
+ Encoding used to decode the input file. Does not apply when `fname` is
+ a file object. The special value 'bytes' enables backward compatibility
+ workarounds that ensure that you receive byte arrays when possible
+ and pass latin1 encoded strings to converters. Override this value to
+ receive unicode arrays and pass strings as input to converters. If set
+ to None the system default is used. The default value is 'bytes'.
+
+ .. versionadded:: 1.14.0
Returns
-------
@@ -1536,15 +1660,6 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
if max_rows < 1:
raise ValueError("'max_rows' must be at least 1.")
- # Py3 data conversions to bytes, for convenience
- if comments is not None:
- comments = asbytes(comments)
- if isinstance(delimiter, unicode):
- delimiter = asbytes(delimiter)
- if isinstance(missing_values, (unicode, list, tuple)):
- missing_values = asbytes_nested(missing_values)
-
- #
if usemask:
from numpy.ma import MaskedArray, make_mask_descr
# Check the input dictionary of converters
@@ -1554,16 +1669,19 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
"The input argument 'converter' should be a valid dictionary "
"(got '%s' instead)" % type(user_converters))
+ if encoding == 'bytes':
+ encoding = None
+ byte_converters = True
+ else:
+ byte_converters = False
+
# Initialize the filehandle, the LineSplitter and the NameValidator
own_fhd = False
try:
if is_pathlib_path(fname):
fname = str(fname)
if isinstance(fname, basestring):
- if sys.version_info[0] == 2:
- fhd = iter(np.lib._datasource.open(fname, 'rbU'))
- else:
- fhd = iter(np.lib._datasource.open(fname, 'rb'))
+ fhd = iter(np.lib._datasource.open(fname, 'rt', encoding=encoding))
own_fhd = True
else:
fhd = iter(fname)
@@ -1573,7 +1691,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
"or generator. Got %s instead." % type(fname))
split_line = LineSplitter(delimiter=delimiter, comments=comments,
- autostrip=autostrip)._handyman
+ autostrip=autostrip, encoding=encoding)
validate_names = NameValidator(excludelist=excludelist,
deletechars=deletechars,
case_sensitive=case_sensitive,
@@ -1587,15 +1705,15 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
first_values = None
try:
while not first_values:
- first_line = next(fhd)
+ first_line = _decode_line(next(fhd), encoding)
if names is True:
if comments in first_line:
first_line = (
- b''.join(first_line.split(comments)[1:]))
+ ''.join(first_line.split(comments)[1:]))
first_values = split_line(first_line)
except StopIteration:
# return an empty array if the datafile is empty
- first_line = b''
+ first_line = ''
first_values = []
warnings.warn('genfromtxt: Empty input file: "%s"' % fname, stacklevel=2)
@@ -1618,9 +1736,8 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
# Check the names and overwrite the dtype.names if needed
if names is True:
- names = validate_names([_bytes_to_name(_.strip())
- for _ in first_values])
- first_line = b''
+ names = validate_names([str(_.strip()) for _ in first_values])
+ first_line = ''
elif _is_string_like(names):
names = validate_names([_.strip() for _ in names.split(',')])
elif names:
@@ -1657,9 +1774,11 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
# Process the missing values ...............................
# Rename missing_values for convenience
user_missing_values = missing_values or ()
+ if isinstance(user_missing_values, bytes):
+ user_missing_values = user_missing_values.decode('latin1')
# Define the list of missing_values (one column: one list)
- missing_values = [list([b'']) for _ in range(nbcols)]
+ missing_values = [list(['']) for _ in range(nbcols)]
# We have a dictionary: process it field by field
if isinstance(user_missing_values, dict):
@@ -1698,8 +1817,8 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
if value not in entry:
entry.append(value)
# We have a string : apply it to all entries
- elif isinstance(user_missing_values, bytes):
- user_value = user_missing_values.split(b",")
+ elif isinstance(user_missing_values, basestring):
+ user_value = user_missing_values.split(",")
for entry in missing_values:
entry.extend(user_value)
# We have something else: apply it to all entries
@@ -1787,16 +1906,29 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
testing_value = first_values[j]
else:
testing_value = None
- converters[i].update(conv, locked=True,
+ if conv is bytes:
+ user_conv = asbytes
+ elif byte_converters:
+ # converters may use decode to work around numpy's old behaviour,
+ # so encode the string again before passing to the user converter
+ def tobytes_first(x, conv):
+ if type(x) is bytes:
+ return conv(x)
+ return conv(x.encode("latin1"))
+ import functools
+ user_conv = functools.partial(tobytes_first, conv=conv)
+ else:
+ user_conv = conv
+ converters[i].update(user_conv, locked=True,
testing_value=testing_value,
default=filling_values[i],
missing_values=missing_values[i],)
- uc_update.append((i, conv))
+ uc_update.append((i, user_conv))
# Make sure we have the corrected keys in user_converters...
user_converters.update(uc_update)
# Fixme: possible error as following variable never used.
- #miss_chars = [_.missing_values for _ in converters]
+ # miss_chars = [_.missing_values for _ in converters]
# Initialize the output lists ...
# ... rows
@@ -1908,16 +2040,42 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
column_types = [conv.type for conv in converters]
# Find the columns with strings...
strcolidx = [i for (i, v) in enumerate(column_types)
- if v in (type('S'), np.string_)]
+ if v == np.unicode_]
+
+ type_str = np.unicode_
+ if byte_converters and strcolidx:
+ # convert strings back to bytes for backward compatibility
+ warnings.warn(
+ "Reading unicode strings without specifying the encoding "
+ "argument is deprecated. Set the encoding, use None for the "
+ "system default.",
+ np.VisibleDeprecationWarning, stacklevel=2)
+ def encode_unicode_cols(row_tup):
+ row = list(row_tup)
+ for i in strcolidx:
+ row[i] = row[i].encode('latin1')
+ return tuple(row)
+
+ try:
+ data = [encode_unicode_cols(r) for r in data]
+ type_str = np.bytes_
+ except UnicodeEncodeError:
+ pass
+
+
# ... and take the largest number of chars.
for i in strcolidx:
- column_types[i] = "|S%i" % max(len(row[i]) for row in data)
+ max_line_length = max(len(row[i]) for row in data)
+ column_types[i] = np.dtype((type_str, max_line_length))
#
if names is None:
# If the dtype is uniform, don't define names, else use ''
base = set([c.type for c in converters if c._checked])
if len(base) == 1:
- (ddtype, mdtype) = (list(base)[0], bool)
+ if strcolidx:
+ (ddtype, mdtype) = (type_str, bool)
+ else:
+ (ddtype, mdtype) = (list(base)[0], bool)
else:
ddtype = [(defaultfmt % i, dt)
for (i, dt) in enumerate(column_types)]
@@ -1966,8 +2124,8 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
# Keep the dtype of the current converter
if i in user_converters:
ishomogeneous &= (ttype == dtype.type)
- if ttype == np.string_:
- ttype = "|S%i" % max(len(row[i]) for row in data)
+ if np.issubdtype(ttype, np.character):
+ ttype = (ttype, max(len(row[i]) for row in data))
descr.append(('', ttype))
else:
descr.append(('', dtype))
@@ -1990,9 +2148,9 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
# Try to take care of the missing data we missed
names = output.dtype.names
if usemask and names:
- for (name, conv) in zip(names or (), converters):
+ for (name, conv) in zip(names, converters):
missing_values = [conv(_) for _ in conv.missing_values
- if _ != b'']
+ if _ != '']
for mval in missing_values:
outputmask[name] |= (output[name] == mval)
# Construct the final array
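Under the default encoding='bytes', genfromtxt still hands back byte
string columns, but now emits a VisibleDeprecationWarning because that
path re-encodes decoded text as latin1 (the tests below assert this). A
sketch of both modes:

    import io
    import numpy as np

    data = io.StringIO(u'A 64\nBCD 25')
    legacy = np.genfromtxt(data, dtype=None)    # warns, 'S' string column

    data.seek(0)                                # no warning, 'U' column
    modern = np.genfromtxt(data, dtype=None, encoding='UTF-8')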
diff --git a/numpy/lib/tests/test__iotools.py b/numpy/lib/tests/test__iotools.py
index 03192896c..54fac8da4 100644
--- a/numpy/lib/tests/test__iotools.py
+++ b/numpy/lib/tests/test__iotools.py
@@ -12,6 +12,7 @@ from numpy.lib._iotools import (
LineSplitter, NameValidator, StringConverter,
has_nested_fields, easy_dtype, flatten_dtype
)
+from numpy.compat import unicode
class TestLineSplitter(object):
@@ -19,61 +20,61 @@ class TestLineSplitter(object):
def test_no_delimiter(self):
"Test LineSplitter w/o delimiter"
- strg = b" 1 2 3 4 5 # test"
+ strg = " 1 2 3 4 5 # test"
test = LineSplitter()(strg)
- assert_equal(test, [b'1', b'2', b'3', b'4', b'5'])
+ assert_equal(test, ['1', '2', '3', '4', '5'])
test = LineSplitter('')(strg)
- assert_equal(test, [b'1', b'2', b'3', b'4', b'5'])
+ assert_equal(test, ['1', '2', '3', '4', '5'])
def test_space_delimiter(self):
"Test space delimiter"
- strg = b" 1 2 3 4 5 # test"
- test = LineSplitter(b' ')(strg)
- assert_equal(test, [b'1', b'2', b'3', b'4', b'', b'5'])
- test = LineSplitter(b' ')(strg)
- assert_equal(test, [b'1 2 3 4', b'5'])
+ strg = " 1 2 3 4 5 # test"
+ test = LineSplitter(' ')(strg)
+ assert_equal(test, ['1', '2', '3', '4', '', '5'])
+ test = LineSplitter(' ')(strg)
+ assert_equal(test, ['1 2 3 4', '5'])
def test_tab_delimiter(self):
"Test tab delimiter"
- strg = b" 1\t 2\t 3\t 4\t 5 6"
- test = LineSplitter(b'\t')(strg)
- assert_equal(test, [b'1', b'2', b'3', b'4', b'5 6'])
- strg = b" 1 2\t 3 4\t 5 6"
- test = LineSplitter(b'\t')(strg)
- assert_equal(test, [b'1 2', b'3 4', b'5 6'])
+ strg = " 1\t 2\t 3\t 4\t 5 6"
+ test = LineSplitter('\t')(strg)
+ assert_equal(test, ['1', '2', '3', '4', '5 6'])
+ strg = " 1 2\t 3 4\t 5 6"
+ test = LineSplitter('\t')(strg)
+ assert_equal(test, ['1 2', '3 4', '5 6'])
def test_other_delimiter(self):
"Test LineSplitter on delimiter"
- strg = b"1,2,3,4,,5"
- test = LineSplitter(b',')(strg)
- assert_equal(test, [b'1', b'2', b'3', b'4', b'', b'5'])
+ strg = "1,2,3,4,,5"
+ test = LineSplitter(',')(strg)
+ assert_equal(test, ['1', '2', '3', '4', '', '5'])
#
- strg = b" 1,2,3,4,,5 # test"
- test = LineSplitter(b',')(strg)
- assert_equal(test, [b'1', b'2', b'3', b'4', b'', b'5'])
+ strg = " 1,2,3,4,,5 # test"
+ test = LineSplitter(',')(strg)
+ assert_equal(test, ['1', '2', '3', '4', '', '5'])
def test_constant_fixed_width(self):
"Test LineSplitter w/ fixed-width fields"
- strg = b" 1 2 3 4 5 # test"
+ strg = " 1 2 3 4 5 # test"
test = LineSplitter(3)(strg)
- assert_equal(test, [b'1', b'2', b'3', b'4', b'', b'5', b''])
+ assert_equal(test, ['1', '2', '3', '4', '', '5', ''])
#
- strg = b" 1 3 4 5 6# test"
+ strg = " 1 3 4 5 6# test"
test = LineSplitter(20)(strg)
- assert_equal(test, [b'1 3 4 5 6'])
+ assert_equal(test, ['1 3 4 5 6'])
#
- strg = b" 1 3 4 5 6# test"
+ strg = " 1 3 4 5 6# test"
test = LineSplitter(30)(strg)
- assert_equal(test, [b'1 3 4 5 6'])
+ assert_equal(test, ['1 3 4 5 6'])
def test_variable_fixed_width(self):
- strg = b" 1 3 4 5 6# test"
+ strg = " 1 3 4 5 6# test"
test = LineSplitter((3, 6, 6, 3))(strg)
- assert_equal(test, [b'1', b'3', b'4 5', b'6'])
+ assert_equal(test, ['1', '3', '4 5', '6'])
#
- strg = b" 1 3 4 5 6# test"
+ strg = " 1 3 4 5 6# test"
test = LineSplitter((6, 6, 9))(strg)
- assert_equal(test, [b'1', b'3 4', b'5 6'])
+ assert_equal(test, ['1', '3 4', '5 6'])
# -----------------------------------------------------------------------------
@@ -133,10 +134,7 @@ class TestNameValidator(object):
def _bytes_to_date(s):
- if sys.version_info[0] >= 3:
- return date(*time.strptime(s.decode('latin1'), "%Y-%m-%d")[:3])
- else:
- return date(*time.strptime(s, "%Y-%m-%d")[:3])
+ return date(*time.strptime(s, "%Y-%m-%d")[:3])
class TestStringConverter(object):
@@ -155,39 +153,45 @@ class TestStringConverter(object):
assert_equal(converter._status, 0)
# test int
- assert_equal(converter.upgrade(b'0'), 0)
+ assert_equal(converter.upgrade('0'), 0)
assert_equal(converter._status, 1)
- # On systems where integer defaults to 32-bit, the statuses will be
+ # On systems where long defaults to 32-bit, the statuses will be
# offset by one, so we check for this here.
import numpy.core.numeric as nx
- status_offset = int(nx.dtype(nx.integer).itemsize < nx.dtype(nx.int64).itemsize)
+ status_offset = int(nx.dtype(nx.int_).itemsize < nx.dtype(nx.int64).itemsize)
# test int > 2**32
- assert_equal(converter.upgrade(b'17179869184'), 17179869184)
+ assert_equal(converter.upgrade('17179869184'), 17179869184)
assert_equal(converter._status, 1 + status_offset)
# test float
- assert_allclose(converter.upgrade(b'0.'), 0.0)
+ assert_allclose(converter.upgrade('0.'), 0.0)
assert_equal(converter._status, 2 + status_offset)
# test complex
- assert_equal(converter.upgrade(b'0j'), complex('0j'))
+ assert_equal(converter.upgrade('0j'), complex('0j'))
assert_equal(converter._status, 3 + status_offset)
# test str
- assert_equal(converter.upgrade(b'a'), b'a')
- assert_equal(converter._status, len(converter._mapper) - 1)
+ # note that the longdouble type has been skipped, so the
+ # _status increases by 2. Everything should succeed with
+ # unicode conversion (5).
+ for s in ['a', u'a', b'a']:
+ res = converter.upgrade(s)
+ assert_(type(res) is unicode)
+ assert_equal(res, u'a')
+ assert_equal(converter._status, 5 + status_offset)
def test_missing(self):
"Tests the use of missing values."
- converter = StringConverter(missing_values=(b'missing',
- b'missed'))
- converter.upgrade(b'0')
- assert_equal(converter(b'0'), 0)
- assert_equal(converter(b''), converter.default)
- assert_equal(converter(b'missing'), converter.default)
- assert_equal(converter(b'missed'), converter.default)
+ converter = StringConverter(missing_values=('missing',
+ 'missed'))
+ converter.upgrade('0')
+ assert_equal(converter('0'), 0)
+ assert_equal(converter(''), converter.default)
+ assert_equal(converter('missing'), converter.default)
+ assert_equal(converter('missed'), converter.default)
try:
converter('miss')
except ValueError:
@@ -198,58 +202,59 @@ class TestStringConverter(object):
dateparser = _bytes_to_date
StringConverter.upgrade_mapper(dateparser, date(2000, 1, 1))
convert = StringConverter(dateparser, date(2000, 1, 1))
- test = convert(b'2001-01-01')
+ test = convert('2001-01-01')
assert_equal(test, date(2001, 1, 1))
- test = convert(b'2009-01-01')
+ test = convert('2009-01-01')
assert_equal(test, date(2009, 1, 1))
- test = convert(b'')
+ test = convert('')
assert_equal(test, date(2000, 1, 1))
def test_string_to_object(self):
"Make sure that string-to-object functions are properly recognized"
+ old_mapper = StringConverter._mapper[:] # copy of list
conv = StringConverter(_bytes_to_date)
- assert_equal(conv._mapper[-2][0](0), 0j)
+ assert_equal(conv._mapper, old_mapper)
assert_(hasattr(conv, 'default'))
def test_keep_default(self):
"Make sure we don't lose an explicit default"
- converter = StringConverter(None, missing_values=b'',
+ converter = StringConverter(None, missing_values='',
default=-999)
- converter.upgrade(b'3.14159265')
+ converter.upgrade('3.14159265')
assert_equal(converter.default, -999)
assert_equal(converter.type, np.dtype(float))
#
converter = StringConverter(
- None, missing_values=b'', default=0)
- converter.upgrade(b'3.14159265')
+ None, missing_values='', default=0)
+ converter.upgrade('3.14159265')
assert_equal(converter.default, 0)
assert_equal(converter.type, np.dtype(float))
def test_keep_default_zero(self):
"Check that we don't lose a default of 0"
converter = StringConverter(int, default=0,
- missing_values=b"N/A")
+ missing_values="N/A")
assert_equal(converter.default, 0)
def test_keep_missing_values(self):
"Check that we're not losing missing values"
converter = StringConverter(int, default=0,
- missing_values=b"N/A")
+ missing_values="N/A")
assert_equal(
- converter.missing_values, set([b'', b'N/A']))
+ converter.missing_values, set(['', 'N/A']))
def test_int64_dtype(self):
"Check that int64 integer types can be specified"
converter = StringConverter(np.int64, default=0)
- val = b"-9223372036854775807"
+ val = "-9223372036854775807"
assert_(converter(val) == -9223372036854775807)
- val = b"9223372036854775807"
+ val = "9223372036854775807"
assert_(converter(val) == 9223372036854775807)
def test_uint64_dtype(self):
"Check that uint64 integer types can be specified"
converter = StringConverter(np.uint64, default=0)
- val = b"9223372043271415339"
+ val = "9223372043271415339"
assert_(converter(val) == 9223372043271415339)
diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py
index 6f7fcc54c..75a8e4968 100644
--- a/numpy/lib/tests/test_io.py
+++ b/numpy/lib/tests/test_io.py
@@ -8,8 +8,11 @@ from tempfile import NamedTemporaryFile
import time
import warnings
import gc
-from io import BytesIO
+import io
+from io import BytesIO, StringIO
from datetime import datetime
+import locale
+import re
import numpy as np
import numpy.ma as ma
@@ -17,9 +20,9 @@ from numpy.lib._iotools import ConverterError, ConversionWarning
from numpy.compat import asbytes, bytes, unicode, Path
from numpy.ma.testutils import assert_equal
from numpy.testing import (
- run_module_suite, assert_warns, assert_, assert_raises_regex,
- assert_raises, assert_allclose, assert_array_equal, temppath, dec, IS_PYPY,
- suppress_warnings
+ run_module_suite, assert_warns, assert_, SkipTest,
+ assert_raises_regex, assert_raises, assert_allclose,
+ assert_array_equal, temppath, tempdir, dec, IS_PYPY, suppress_warnings
)
@@ -44,6 +47,16 @@ class TextIO(BytesIO):
MAJVER, MINVER = sys.version_info[:2]
IS_64BIT = sys.maxsize > 2**32
+try:
+ import bz2
+ HAS_BZ2 = True
+except ImportError:
+ HAS_BZ2 = False
+try:
+ import lzma
+ HAS_LZMA = True
+except ImportError:
+ HAS_LZMA = False
def strptime(s, fmt=None):
@@ -52,10 +65,9 @@ def strptime(s, fmt=None):
2.5.
"""
- if sys.version_info[0] >= 3:
- return datetime(*time.strptime(s.decode('latin1'), fmt)[:3])
- else:
- return datetime(*time.strptime(s, fmt)[:3])
+ if type(s) == bytes:
+ s = s.decode("latin1")
+ return datetime(*time.strptime(s, fmt)[:3])
class RoundtripTest(object):
@@ -144,7 +156,7 @@ class RoundtripTest(object):
a = np.array([1, 2, 3, 4], int)
self.roundtrip(a)
- @np.testing.dec.knownfailureif(sys.platform == 'win32', "Fail on Win32")
+ @dec.knownfailureif(sys.platform == 'win32', "Fail on Win32")
def test_mmap(self):
a = np.array([[1, 2.5], [4, 7.3]])
self.roundtrip(a, file_on_disk=True, load_kwds={'mmap_mode': 'r'})
@@ -188,8 +200,8 @@ class TestSavezLoad(RoundtripTest):
self.arr_reloaded.fid.close()
os.remove(self.arr_reloaded.fid.name)
- @np.testing.dec.skipif(not IS_64BIT, "Works only with 64bit systems")
- @np.testing.dec.slow
+ @dec.skipif(not IS_64BIT, "Works only with 64bit systems")
+ @dec.slow
def test_big_arrays(self):
L = (1 << 31) + 100000
a = np.empty(L, dtype=np.uint8)
@@ -265,7 +277,7 @@ class TestSavezLoad(RoundtripTest):
fp.seek(0)
assert_(not fp.closed)
- @np.testing.dec.skipif(IS_PYPY, "context manager required on PyPy")
+ @dec.skipif(IS_PYPY, "context manager required on PyPy")
def test_closing_fid(self):
# Test that issue #1517 (too many opened files) remains closed
# It might be a "weak" test since failed to get triggered on
@@ -331,8 +343,8 @@ class TestSaveTxt(object):
def test_0D_3D(self):
c = BytesIO()
- assert_raises(ValueError, np.savetxt, c, np.array(1))
- assert_raises(ValueError, np.savetxt, c, np.array([[[1], [2]]]))
+ assert_raises(ValueError, np.savetxt, c, np.array(1))
+ assert_raises(ValueError, np.savetxt, c, np.array([[[1], [2]]]))
def test_record(self):
@@ -466,8 +478,134 @@ class TestSaveTxt(object):
b = np.loadtxt(w)
assert_array_equal(a, b)
+ def test_unicode(self):
+ utf8 = b'\xcf\x96'.decode('UTF-8')
+ a = np.array([utf8], dtype=np.unicode)
+ with tempdir() as tmpdir:
+ # set the encoding: on Windows it may not be unicode even on py3
+ np.savetxt(os.path.join(tmpdir, 'test.csv'), a, fmt=['%s'],
+ encoding='UTF-8')
+
+ def test_unicode_roundtrip(self):
+ utf8 = b'\xcf\x96'.decode('UTF-8')
+ a = np.array([utf8], dtype=np.unicode)
+ # our gz wrapper supports encoding
+ suffixes = ['', '.gz']
+ # the Python 2 stdlib openers do not support encoding
+ if MAJVER > 2:
+ if HAS_BZ2:
+ suffixes.append('.bz2')
+ if HAS_LZMA:
+ suffixes.extend(['.xz', '.lzma'])
+ with tempdir() as tmpdir:
+ for suffix in suffixes:
+ np.savetxt(os.path.join(tmpdir, 'test.csv' + suffix), a,
+ fmt=['%s'], encoding='UTF-16-LE')
+ b = np.loadtxt(os.path.join(tmpdir, 'test.csv' + suffix),
+ encoding='UTF-16-LE', dtype=np.unicode)
+ assert_array_equal(a, b)
+
+ def test_unicode_bytestream(self):
+ utf8 = b'\xcf\x96'.decode('UTF-8')
+ a = np.array([utf8], dtype=np.unicode)
+ s = BytesIO()
+ np.savetxt(s, a, fmt=['%s'], encoding='UTF-8')
+ s.seek(0)
+ assert_equal(s.read().decode('UTF-8'), utf8 + '\n')
+
+ def test_unicode_stringstream(self):
+ utf8 = b'\xcf\x96'.decode('UTF-8')
+ a = np.array([utf8], dtype=np.unicode)
+ s = StringIO()
+ np.savetxt(s, a, fmt=['%s'], encoding='UTF-8')
+ s.seek(0)
+ assert_equal(s.read(), utf8 + '\n')
+
+
+class LoadTxtBase(object):
+ def check_compressed(self, fopen, suffixes):
+ # Test that we can load data from a compressed file
+ wanted = np.arange(6).reshape((2, 3))
+ linesep = ('\n', '\r\n', '\r')
+ for sep in linesep:
+ data = '0 1 2' + sep + '3 4 5'
+ for suffix in suffixes:
+ with temppath(suffix=suffix) as name:
+ with fopen(name, mode='wt', encoding='UTF-32-LE') as f:
+ f.write(data)
+ res = self.loadfunc(name, encoding='UTF-32-LE')
+ assert_array_equal(res, wanted)
+ with fopen(name, "rt", encoding='UTF-32-LE') as f:
+ res = self.loadfunc(f)
+ assert_array_equal(res, wanted)
+
+ # the Python 2 stdlib open functions do not support encoding
+ @dec.skipif(MAJVER == 2)
+ def test_compressed_gzip(self):
+ self.check_compressed(gzip.open, ('.gz',))
+
+ @dec.skipif(MAJVER == 2 or not HAS_BZ2)
+ def test_compressed_bz2(self):
+ self.check_compressed(bz2.open, ('.bz2',))
+
+ @dec.skipif(MAJVER == 2 or not HAS_LZMA)
+ def test_compressed_lzma(self):
+ self.check_compressed(lzma.open, ('.xz', '.lzma'))
+
+ def test_encoding(self):
+ with temppath() as path:
+ with open(path, "wb") as f:
+ f.write('0.\n1.\n2.'.encode("UTF-16"))
+ x = self.loadfunc(path, encoding="UTF-16")
+ assert_array_equal(x, [0., 1., 2.])
+
+ def test_stringload(self):
+ # umlauts
+ nonascii = b'\xc3\xb6\xc3\xbc\xc3\xb6'.decode("UTF-8")
+ with temppath() as path:
+ with open(path, "wb") as f:
+ f.write(nonascii.encode("UTF-16"))
+ x = self.loadfunc(path, encoding="UTF-16", dtype=np.unicode)
+ assert_array_equal(x, nonascii)
+
+ def test_binary_decode(self):
+ utf16 = b'\xff\xfeh\x04 \x00i\x04 \x00j\x04'
+ v = self.loadfunc(BytesIO(utf16), dtype=np.unicode, encoding='UTF-16')
+ assert_array_equal(v, np.array(utf16.decode('UTF-16').split()))
+
+ def test_converters_decode(self):
+ # test converters that decode strings
+ c = TextIO()
+ c.write(b'\xcf\x96')
+ c.seek(0)
+ x = self.loadfunc(c, dtype=np.unicode,
+ converters={0: lambda x: x.decode('UTF-8')})
+ a = np.array([b'\xcf\x96'.decode('UTF-8')])
+ assert_array_equal(x, a)
+
+ def test_converters_nodecode(self):
+ # test native string converters enabled by setting an encoding
+ utf8 = b'\xcf\x96'.decode('UTF-8')
+ with temppath() as path:
+ with io.open(path, 'wt', encoding='UTF-8') as f:
+ f.write(utf8)
+ x = self.loadfunc(path, dtype=np.unicode,
+ converters={0: lambda x: x + 't'},
+ encoding='UTF-8')
+ a = np.array([utf8 + 't'])
+ assert_array_equal(x, a)
+
+
+class TestLoadTxt(LoadTxtBase):
+ loadfunc = staticmethod(np.loadtxt)
+
+ def setUp(self):
+ # lower chunksize for testing
+ self.orig_chunk = np.lib.npyio._loadtxt_chunksize
+ np.lib.npyio._loadtxt_chunksize = 1
+ def tearDown(self):
+ np.lib.npyio._loadtxt_chunksize = self.orig_chunk
-class TestLoadTxt(object):
def test_record(self):
c = TextIO()
c.write('1 2\n3 4')
@@ -869,9 +1007,24 @@ class TestLoadTxt(object):
dt = np.dtype([('x', int), ('a', 'S10'), ('y', int)])
np.loadtxt(c, delimiter=',', dtype=dt, comments=None) # Should succeed
+ @dec.skipif(locale.getpreferredencoding() == 'ANSI_X3.4-1968')
+ def test_binary_load(self):
+ butf8 = b"5,6,7,\xc3\x95scarscar\n\r15,2,3,hello\n\r"\
+ b"20,2,3,\xc3\x95scar\n\r"
+ sutf8 = butf8.decode("UTF-8").replace("\r", "").splitlines()
+ with temppath() as path:
+ with open(path, "wb") as f:
+ f.write(butf8)
+ with open(path, "rb") as f:
+ x = np.loadtxt(f, encoding="UTF-8", dtype=np.unicode)
+ assert_array_equal(x, sutf8)
+ # test broken latin1 conversion people now rely on
+ with open(path, "rb") as f:
+ x = np.loadtxt(f, encoding="UTF-8", dtype="S")
+ expected = [b'5,6,7,\xc3\x95scarscar', b'15,2,3,hello', b'20,2,3,\xc3\x95scar']
+ assert_array_equal(x, np.array(expected, dtype="S"))
class Testfromregex(object):
- # np.fromregex expects files opened in binary mode.
def test_record(self):
c = TextIO()
c.write('1.312 foo\n1.534 bar\n4.444 qux')
@@ -904,12 +1057,29 @@ class Testfromregex(object):
a = np.array([(1312,), (1534,), (4444,)], dtype=dt)
assert_array_equal(x, a)
+ def test_record_unicode(self):
+ utf8 = b'\xcf\x96'
+ with temppath() as path:
+ with open(path, 'wb') as f:
+ f.write(b'1.312 foo' + utf8 + b' \n1.534 bar\n4.444 qux')
+
+ dt = [('num', np.float64), ('val', 'U4')]
+ x = np.fromregex(path, r"(?u)([0-9.]+)\s+(\w+)", dt, encoding='UTF-8')
+ a = np.array([(1.312, 'foo' + utf8.decode('UTF-8')), (1.534, 'bar'),
+ (4.444, 'qux')], dtype=dt)
+ assert_array_equal(x, a)
+
+ regexp = re.compile(r"([0-9.]+)\s+(\w+)", re.UNICODE)
+ x = np.fromregex(path, regexp, dt, encoding='UTF-8')
+ assert_array_equal(x, a)
+
#####--------------------------------------------------------------------------
-class TestFromTxt(object):
- #
+class TestFromTxt(LoadTxtBase):
+ loadfunc = staticmethod(np.genfromtxt)
+
def test_record(self):
# Test w/ explicit dtype
data = TextIO('1 2\n3 4')
@@ -1012,7 +1182,10 @@ class TestFromTxt(object):
def test_header(self):
# Test retrieving a header
data = TextIO('gender age weight\nM 64.0 75.0\nF 25.0 60.0')
- test = np.ndfromtxt(data, dtype=None, names=True)
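+        # dtype=None with no encoding argument makes the reader guess from
+        # byte strings; this change deprecates that, so every such call now
+        # has to catch a VisibleDeprecationWarning.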
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', np.VisibleDeprecationWarning)
+ test = np.ndfromtxt(data, dtype=None, names=True)
+ assert_(w[0].category is np.VisibleDeprecationWarning)
control = {'gender': np.array([b'M', b'F']),
'age': np.array([64.0, 25.0]),
'weight': np.array([75.0, 60.0])}
@@ -1023,7 +1196,10 @@ class TestFromTxt(object):
def test_auto_dtype(self):
# Test the automatic definition of the output dtype
data = TextIO('A 64 75.0 3+4j True\nBCD 25 60.0 5+6j False')
- test = np.ndfromtxt(data, dtype=None)
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', np.VisibleDeprecationWarning)
+ test = np.ndfromtxt(data, dtype=None)
+ assert_(w[0].category is np.VisibleDeprecationWarning)
control = [np.array([b'A', b'BCD']),
np.array([64, 25]),
np.array([75.0, 60.0]),
@@ -1069,7 +1245,10 @@ F 35 58.330000
M 33 21.99
""")
# The # is part of the first name and should be deleted automatically.
- test = np.genfromtxt(data, names=True, dtype=None)
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', np.VisibleDeprecationWarning)
+ test = np.genfromtxt(data, names=True, dtype=None)
+ assert_(w[0].category is np.VisibleDeprecationWarning)
ctrl = np.array([('M', 21, 72.1), ('F', 35, 58.33), ('M', 33, 21.99)],
dtype=[('gender', '|S1'), ('age', int), ('weight', float)])
assert_equal(test, ctrl)
@@ -1080,14 +1259,20 @@ M 21 72.100000
F 35 58.330000
M 33 21.99
""")
- test = np.genfromtxt(data, names=True, dtype=None)
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', np.VisibleDeprecationWarning)
+ test = np.genfromtxt(data, names=True, dtype=None)
+ assert_(w[0].category is np.VisibleDeprecationWarning)
assert_equal(test, ctrl)
def test_autonames_and_usecols(self):
# Tests names and usecols
data = TextIO('A B C D\n aaaa 121 45 9.1')
- test = np.ndfromtxt(data, usecols=('A', 'C', 'D'),
- names=True, dtype=None)
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', np.VisibleDeprecationWarning)
+ test = np.ndfromtxt(data, usecols=('A', 'C', 'D'),
+ names=True, dtype=None)
+ assert_(w[0].category is np.VisibleDeprecationWarning)
control = np.array(('aaaa', 45, 9.1),
dtype=[('A', '|S4'), ('C', int), ('D', float)])
assert_equal(test, control)
@@ -1104,8 +1289,12 @@ M 33 21.99
def test_converters_with_usecols_and_names(self):
# Tests names and usecols
data = TextIO('A B C D\n aaaa 121 45 9.1')
- test = np.ndfromtxt(data, usecols=('A', 'C', 'D'), names=True,
- dtype=None, converters={'C': lambda s: 2 * int(s)})
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', np.VisibleDeprecationWarning)
+ test = np.ndfromtxt(data, usecols=('A', 'C', 'D'), names=True,
+ dtype=None,
+ converters={'C': lambda s: 2 * int(s)})
+ assert_(w[0].category is np.VisibleDeprecationWarning)
control = np.array(('aaaa', 90, 9.1),
dtype=[('A', '|S4'), ('C', int), ('D', float)])
assert_equal(test, control)
@@ -1225,6 +1414,18 @@ M 33 21.99
dtype=[('', '|S10'), ('', float)])
assert_equal(test, control)
+
+    def test_utf8_userconverters_with_explicit_dtype(self):
+ utf8 = b'\xcf\x96'
+ with temppath() as path:
+ with open(path, 'wb') as f:
+ f.write(b'skip,skip,2001-01-01' + utf8 + b',1.0,skip')
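+            # with an explicit encoding, user converters receive str values
+            # that are already decoded, so np.unicode works on them directly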
+ test = np.genfromtxt(path, delimiter=",", names=None, dtype=float,
+ usecols=(2, 3), converters={2: np.unicode},
+ encoding='UTF-8')
+ control = np.array([('2001-01-01' + utf8.decode('UTF-8'), 1.)],
+ dtype=[('', '|U11'), ('', float)])
+ assert_equal(test, control)
+
def test_spacedelimiter(self):
# Test space delimiter
data = TextIO("1 2 3 4 5\n6 7 8 9 10")
@@ -1551,11 +1752,17 @@ M 33 21.99
# Test autostrip
data = "01/01/2003 , 1.3, abcde"
kwargs = dict(delimiter=",", dtype=None)
- mtest = np.ndfromtxt(TextIO(data), **kwargs)
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', np.VisibleDeprecationWarning)
+ mtest = np.ndfromtxt(TextIO(data), **kwargs)
+ assert_(w[0].category is np.VisibleDeprecationWarning)
ctrl = np.array([('01/01/2003 ', 1.3, ' abcde')],
dtype=[('f0', '|S12'), ('f1', float), ('f2', '|S8')])
assert_equal(mtest, ctrl)
- mtest = np.ndfromtxt(TextIO(data), autostrip=True, **kwargs)
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', np.VisibleDeprecationWarning)
+ mtest = np.ndfromtxt(TextIO(data), autostrip=True, **kwargs)
+ assert_(w[0].category is np.VisibleDeprecationWarning)
ctrl = np.array([('01/01/2003', 1.3, 'abcde')],
dtype=[('f0', '|S10'), ('f1', float), ('f2', '|S5')])
assert_equal(mtest, ctrl)
@@ -1675,13 +1882,127 @@ M 33 21.99
def test_comments_is_none(self):
# Github issue 329 (None was previously being converted to 'None').
- test = np.genfromtxt(TextIO("test1,testNonetherestofthedata"),
- dtype=None, comments=None, delimiter=',')
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', np.VisibleDeprecationWarning)
+ test = np.genfromtxt(TextIO("test1,testNonetherestofthedata"),
+ dtype=None, comments=None, delimiter=',')
+ assert_(w[0].category is np.VisibleDeprecationWarning)
assert_equal(test[1], b'testNonetherestofthedata')
- test = np.genfromtxt(TextIO("test1, testNonetherestofthedata"),
- dtype=None, comments=None, delimiter=',')
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', np.VisibleDeprecationWarning)
+ test = np.genfromtxt(TextIO("test1, testNonetherestofthedata"),
+ dtype=None, comments=None, delimiter=',')
+ assert_(w[0].category is np.VisibleDeprecationWarning)
assert_equal(test[1], b' testNonetherestofthedata')
+
+    def test_latin1(self):
+ latin1 = b'\xf6\xfc\xf6'
+ norm = b"norm1,norm2,norm3\n"
+ enc = b"test1,testNonethe" + latin1 + b",test3\n"
+ s = norm + enc + norm
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', np.VisibleDeprecationWarning)
+ test = np.genfromtxt(TextIO(s),
+ dtype=None, comments=None, delimiter=',')
+ assert_(w[0].category is np.VisibleDeprecationWarning)
+ assert_equal(test[1, 0], b"test1")
+ assert_equal(test[1, 1], b"testNonethe" + latin1)
+ assert_equal(test[1, 2], b"test3")
+ test = np.genfromtxt(TextIO(s),
+ dtype=None, comments=None, delimiter=',',
+ encoding='latin1')
+ assert_equal(test[1, 0], u"test1")
+ assert_equal(test[1, 1], u"testNonethe" + latin1.decode('latin1'))
+ assert_equal(test[1, 2], u"test3")
+
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', np.VisibleDeprecationWarning)
+ test = np.genfromtxt(TextIO(b"0,testNonethe" + latin1),
+ dtype=None, comments=None, delimiter=',')
+ assert_(w[0].category is np.VisibleDeprecationWarning)
+ assert_equal(test['f0'], 0)
+ assert_equal(test['f1'], b"testNonethe" + latin1)
+
+ def test_binary_decode_autodtype(self):
+ utf16 = b'\xff\xfeh\x04 \x00i\x04 \x00j\x04'
+ v = self.loadfunc(BytesIO(utf16), dtype=None, encoding='UTF-16')
+ assert_array_equal(v, np.array(utf16.decode('UTF-16').split()))
+
+ def test_utf8_byte_encoding(self):
+ utf8 = b"\xcf\x96"
+ norm = b"norm1,norm2,norm3\n"
+ enc = b"test1,testNonethe" + utf8 + b",test3\n"
+ s = norm + enc + norm
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', np.VisibleDeprecationWarning)
+ test = np.genfromtxt(TextIO(s),
+ dtype=None, comments=None, delimiter=',')
+ assert_(w[0].category is np.VisibleDeprecationWarning)
+ ctl = np.array([
+ [b'norm1', b'norm2', b'norm3'],
+ [b'test1', b'testNonethe' + utf8, b'test3'],
+ [b'norm1', b'norm2', b'norm3']])
+ assert_array_equal(test, ctl)
+
+ def test_utf8_file(self):
+ utf8 = b"\xcf\x96"
+ latin1 = b"\xf6\xfc\xf6"
+ with temppath() as path:
+ with open(path, "wb") as f:
+ f.write((b"test1,testNonethe" + utf8 + b",test3\n") * 2)
+ test = np.genfromtxt(path, dtype=None, comments=None,
+ delimiter=',', encoding="UTF-8")
+ ctl = np.array([
+ ["test1", "testNonethe" + utf8.decode("UTF-8"), "test3"],
+ ["test1", "testNonethe" + utf8.decode("UTF-8"), "test3"]],
+ dtype=np.unicode)
+ assert_array_equal(test, ctl)
+
+ # test a mixed dtype
+ with open(path, "wb") as f:
+ f.write(b"0,testNonethe" + utf8)
+ test = np.genfromtxt(path, dtype=None, comments=None,
+ delimiter=',', encoding="UTF-8")
+ assert_equal(test['f0'], 0)
+ assert_equal(test['f1'], "testNonethe" + utf8.decode("UTF-8"))
+
+ def test_utf8_file_nodtype_unicode(self):
+ # bytes encoding with non-latin1 -> unicode upcast
+ utf8 = u'\u03d6'
+ latin1 = u'\xf6\xfc\xf6'
+
+        # Skip the test if the utf8 test string cannot be encoded in the
+        # preferred encoding, which io.open is assumed to use by default.
+        # This will need to change for pytest, maybe using
+        # pytest.mark.xfail(raises=***).
+ try:
+ import locale
+ encoding = locale.getpreferredencoding()
+ utf8.encode(encoding)
+ except (UnicodeError, ImportError):
+ raise SkipTest('Skipping test_utf8_file_nodtype_unicode, '
+ 'unable to encode utf8 in preferred encoding')
+
+ with temppath() as path:
+ with io.open(path, "wt") as f:
+ f.write(u"norm1,norm2,norm3\n")
+ f.write(u"norm1," + latin1 + u",norm3\n")
+ f.write(u"test1,testNonethe" + utf8 + u",test3\n")
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '',
+ np.VisibleDeprecationWarning)
+ test = np.genfromtxt(path, dtype=None, comments=None,
+ delimiter=',')
+ # Check for warning when encoding not specified.
+ assert_(w[0].category is np.VisibleDeprecationWarning)
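+            # the non-latin1 character cannot survive the bytes fallback,
+            # so the result is upcast to a unicode array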
+ ctl = np.array([
+ ["norm1", "norm2", "norm3"],
+ ["norm1", latin1, "norm3"],
+ ["test1", "testNonethe" + utf8, "test3"]],
+ dtype=np.unicode)
+ assert_array_equal(test, ctl)
+
def test_recfromtxt(self):
#
data = TextIO('A,B\n0,1\n2,3')
@@ -1793,11 +2114,7 @@ M 33 21.99
# Test that we can load data from a filename as well as a file
# object
tgt = np.arange(6).reshape((2, 3))
- if sys.version_info[0] >= 3:
- # python 3k is known to fail for '\r'
- linesep = ('\n', '\r\n')
- else:
- linesep = ('\n', '\r\n', '\r')
+ linesep = ('\n', '\r\n', '\r')
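+        # lone '\r' also works on Python 3 now that input goes through
+        # text-mode readers with universal-newline handling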
for sep in linesep:
data = '0 1 2' + sep + '3 4 5'
@@ -1807,6 +2124,22 @@ M 33 21.99
res = np.genfromtxt(name)
assert_array_equal(res, tgt)
+
+    def test_gft_from_gzip(self):
+ # Test that we can load data from a gzipped file
+ wanted = np.arange(6).reshape((2, 3))
+ linesep = ('\n', '\r\n', '\r')
+
+ for sep in linesep:
+ data = '0 1 2' + sep + '3 4 5'
+ s = BytesIO()
+ with gzip.GzipFile(fileobj=s, mode='w') as g:
+ g.write(asbytes(data))
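+            # s now holds the gzip-compressed payload; written to a '.gz'
+            # file below, genfromtxt's DataSource layer decompresses it
+            # transparently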
+
+            with temppath(suffix='.gz') as name:
+                with open(name, 'wb') as f:
+                    f.write(s.getvalue())
+ assert_array_equal(np.genfromtxt(name), wanted)
+
def test_gft_using_generator(self):
# gft doesn't work with unicode.
def count():
@@ -1844,7 +2177,7 @@ M 33 21.99
class TestPathUsage(object):
# Test that pathlib.Path can be used
- @np.testing.dec.skipif(Path is None, "No pathlib.Path")
+ @dec.skipif(Path is None, "No pathlib.Path")
def test_loadtxt(self):
with temppath(suffix='.txt') as path:
path = Path(path)
@@ -1853,7 +2186,7 @@ class TestPathUsage(object):
x = np.loadtxt(path)
assert_array_equal(x, a)
- @np.testing.dec.skipif(Path is None, "No pathlib.Path")
+ @dec.skipif(Path is None, "No pathlib.Path")
def test_save_load(self):
# Test that pathlib.Path instances can be used with savez.
with temppath(suffix='.npy') as path:
@@ -1863,7 +2196,7 @@ class TestPathUsage(object):
data = np.load(path)
assert_array_equal(data, a)
- @np.testing.dec.skipif(Path is None, "No pathlib.Path")
+ @dec.skipif(Path is None, "No pathlib.Path")
def test_savez_load(self):
# Test that pathlib.Path instances can be used with savez.
with temppath(suffix='.npz') as path:
@@ -1872,7 +2205,7 @@ class TestPathUsage(object):
with np.load(path) as data:
assert_array_equal(data['lab'], 'place holder')
- @np.testing.dec.skipif(Path is None, "No pathlib.Path")
+ @dec.skipif(Path is None, "No pathlib.Path")
def test_savez_compressed_load(self):
# Test that pathlib.Path instances can be used with savez.
with temppath(suffix='.npz') as path:
@@ -1882,7 +2215,7 @@ class TestPathUsage(object):
assert_array_equal(data['lab'], 'place holder')
data.close()
- @np.testing.dec.skipif(Path is None, "No pathlib.Path")
+ @dec.skipif(Path is None, "No pathlib.Path")
def test_genfromtxt(self):
with temppath(suffix='.txt') as path:
path = Path(path)
@@ -1891,7 +2224,7 @@ class TestPathUsage(object):
data = np.genfromtxt(path)
assert_array_equal(a, data)
- @np.testing.dec.skipif(Path is None, "No pathlib.Path")
+ @dec.skipif(Path is None, "No pathlib.Path")
def test_ndfromtxt(self):
# Test outputting a standard ndarray
with temppath(suffix='.txt') as path:
@@ -1903,7 +2236,7 @@ class TestPathUsage(object):
test = np.ndfromtxt(path, dtype=int)
assert_array_equal(test, control)
- @np.testing.dec.skipif(Path is None, "No pathlib.Path")
+ @dec.skipif(Path is None, "No pathlib.Path")
def test_mafromtxt(self):
# From `test_fancy_dtype_alt` above
with temppath(suffix='.txt') as path:
@@ -1915,7 +2248,7 @@ class TestPathUsage(object):
control = ma.array([(1.0, 2.0, 3.0), (4.0, 5.0, 6.0)])
assert_equal(test, control)
- @np.testing.dec.skipif(Path is None, "No pathlib.Path")
+ @dec.skipif(Path is None, "No pathlib.Path")
def test_recfromtxt(self):
with temppath(suffix='.txt') as path:
path = Path(path)
@@ -1929,7 +2262,7 @@ class TestPathUsage(object):
assert_(isinstance(test, np.recarray))
assert_equal(test, control)
- @np.testing.dec.skipif(Path is None, "No pathlib.Path")
+ @dec.skipif(Path is None, "No pathlib.Path")
def test_recfromcsv(self):
with temppath(suffix='.txt') as path:
path = Path(path)