author    Charles Harris <charlesr.harris@gmail.com>  2017-11-26 10:30:17 -0700
committer GitHub <noreply@github.com>                 2017-11-26 10:30:17 -0700
commit    8c441fae4bf1b101444d06fc5dcfdd29e66a9ba4 (patch)
tree      e8ea159c9d9d3eed3bedc0ea702732119dc0b5c4 /numpy/lib/npyio.py
parent    b13c70551f1ebc5222e9b11da61e2b886fbbef5e (diff)
parent    1d97b3aafdca2722bbe2f0c10a96544121c8f78b (diff)
Merge pull request #10054 from charris/gh-4208
ENH: Add encoding option to numpy text IO.
Diffstat (limited to 'numpy/lib/npyio.py')
-rw-r--r--  numpy/lib/npyio.py | 392
1 file changed, 275 insertions, 117 deletions
diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py
index 6de5940d7..e4d827334 100644
--- a/numpy/lib/npyio.py
+++ b/numpy/lib/npyio.py
@@ -1,5 +1,6 @@
from __future__ import division, absolute_import, print_function
+import io
import sys
import os
import re
@@ -15,11 +16,12 @@ from numpy.core.multiarray import packbits, unpackbits
from ._iotools import (
LineSplitter, NameValidator, StringConverter, ConverterError,
ConverterLockError, ConversionWarning, _is_string_like,
- has_nested_fields, flatten_dtype, easy_dtype, _bytes_to_name
+ has_nested_fields, flatten_dtype, easy_dtype, _decode_line
)
from numpy.compat import (
- asbytes, asstr, asbytes_nested, bytes, basestring, unicode, is_pathlib_path
+ asbytes, asstr, asunicode, asbytes_nested, bytes, basestring, unicode,
+ is_pathlib_path
)
if sys.version_info[0] >= 3:
@@ -294,7 +296,7 @@ def load(file, mmap_mode=None, allow_pickle=True, fix_imports=True,
used in Python 3.
encoding : str, optional
What encoding to use when reading Python 2 strings. Only useful when
- loading Python 2 generated pickled files on Python 3, which includes
+ loading Python 2 generated pickled files in Python 3, which includes
npy/npz files containing object arrays. Values other than 'latin1',
'ASCII', and 'bytes' are not allowed, as they can corrupt numerical
data. Default: 'ASCII'
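For context, a minimal usage sketch of this parameter (the file name is hypothetical, assumed to contain a Python-2-pickled object array):

    import numpy as np

    # on Python 3, objects pickled under Python 2 need a byte-compatible
    # encoding to round-trip; 'latin1' maps all 256 byte values
    a = np.load('py2_objects.npy', encoding='latin1')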
@@ -731,8 +733,8 @@ def _getconv(dtype):
def floatconv(x):
x.lower()
- if b'0x' in x:
- return float.fromhex(asstr(x))
+ if '0x' in x:
+ return float.fromhex(x)
return float(x)
typ = dtype.type
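Opening the file in text mode means hex literals now reach `floatconv` as str, so `float.fromhex` applies directly; a quick illustration of the two paths:

    >>> float.fromhex('0x1.8p1')   # hex mantissa 1.5 scaled by 2**1
    3.0
    >>> float('1.5e0')             # decimal literals still go through float()
    1.5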
@@ -752,13 +754,17 @@ def _getconv(dtype):
return lambda x: complex(asstr(x))
elif issubclass(typ, np.bytes_):
return asbytes
+ elif issubclass(typ, np.unicode_):
+ return asunicode
else:
return asstr
+# number of lines loadtxt reads in one chunk; can be overridden for testing
+_loadtxt_chunksize = 50000
def loadtxt(fname, dtype=float, comments='#', delimiter=None,
converters=None, skiprows=0, usecols=None, unpack=False,
- ndmin=0):
+ ndmin=0, encoding='bytes'):
"""
Load data from a text file.
@@ -776,13 +782,13 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
each row will be interpreted as an element of the array. In this
case, the number of columns used must match the number of fields in
the data-type.
- comments : str or sequence, optional
+ comments : str or sequence of str, optional
The characters or list of characters used to indicate the start of a
- comment;
- default: '#'.
+ comment. For backwards compatibility, byte strings will be decoded as
+ 'latin1'. The default is '#'.
delimiter : str, optional
- The string used to separate values. By default, this is any
- whitespace.
+ The string used to separate values. For backwards compatibility, byte
+ strings will be decoded as 'latin1'. The default is whitespace.
converters : dict, optional
A dictionary mapping column number to a function that will convert
that column to a float. E.g., if column 0 is a date string:
@@ -791,18 +797,15 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
``converters = {3: lambda s: float(s.strip() or 0)}``. Default: None.
skiprows : int, optional
Skip the first `skiprows` lines; default: 0.
-
usecols : int or sequence, optional
Which columns to read, with 0 being the first. For example,
usecols = (1,4,5) will extract the 2nd, 5th and 6th columns.
The default, None, results in all columns being read.
- .. versionadded:: 1.11.0
-
- Also when a single column has to be read it is possible to use
- an integer instead of a tuple. E.g ``usecols = 3`` reads the
- fourth column the same way as `usecols = (3,)`` would.
-
+ .. versionchanged:: 1.11.0
+ When a single column has to be read it is possible to use
+ an integer instead of a tuple. E.g. ``usecols = 3`` reads the
+ fourth column the same way as ``usecols = (3,)`` would.
unpack : bool, optional
If True, the returned array is transposed, so that arguments may be
unpacked using ``x, y, z = loadtxt(...)``. When used with a structured
@@ -813,6 +816,15 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
Legal values: 0 (default), 1 or 2.
.. versionadded:: 1.6.0
+ encoding : str, optional
+ Encoding used to decode the input file. Does not apply to input streams.
+ The special value 'bytes' enables backward compatibility workarounds
+ that ensure you receive byte arrays as results if possible and pass
+ latin1 encoded strings to converters. Override this value to receive
+ unicode arrays and pass strings as input to converters. If set to None
+ the system default is used. The default value is 'bytes'.
+
+ .. versionadded:: 1.14.0
Returns
-------
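A minimal usage sketch of the new parameter (file names and contents are hypothetical):

    import numpy as np

    # decode the file as UTF-8 while reading; with the default
    # encoding='bytes' the text would instead be decoded as latin1
    # for backward compatibility
    a = np.loadtxt('data.txt', encoding='utf-8')

    # dtype=str with an explicit encoding yields a unicode array
    s = np.loadtxt('names.txt', dtype=str, encoding='utf-8')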
@@ -861,16 +873,22 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
# Type conversions for Py3 convenience
if comments is not None:
if isinstance(comments, (basestring, bytes)):
- comments = [asbytes(comments)]
- else:
- comments = [asbytes(comment) for comment in comments]
-
+ comments = [comments]
+ comments = [_decode_line(x) for x in comments]
# Compile regex for comments beforehand
comments = (re.escape(comment) for comment in comments)
- regex_comments = re.compile(b'|'.join(comments))
- user_converters = converters
+ regex_comments = re.compile('|'.join(comments))
+
if delimiter is not None:
- delimiter = asbytes(delimiter)
+ delimiter = _decode_line(delimiter)
+
+ user_converters = converters
+
+ if encoding == 'bytes':
+ encoding = None
+ byte_converters = True
+ else:
+ byte_converters = False
if usecols is not None:
# Allow usecols to be a single int or a sequence of ints
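The effect of `byte_converters` on user-supplied converters can be sketched as follows (data is illustrative):

    import numpy as np
    from io import StringIO

    data = u"$1.00 2\n$2.50 3\n"

    # a converter written against older NumPy expects bytes; under the
    # default encoding='bytes' each value is re-encoded as latin1 before
    # reaching it, so legacy code keeps working
    legacy = {0: lambda b: float(b.strip(b'$'))}
    a = np.loadtxt(StringIO(data), converters=legacy)

    # with an explicit encoding the converter receives str instead
    new = {0: lambda s: float(s.strip('$'))}
    b = np.loadtxt(StringIO(data), converters=new, encoding='utf-8')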
@@ -896,22 +914,24 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
if is_pathlib_path(fname):
fname = str(fname)
if _is_string_like(fname):
+ fh = np.lib._datasource.open(fname, 'rt', encoding=encoding)
+ fencoding = getattr(fh, 'encoding', 'latin1')
+ fh = iter(fh)
fown = True
- if fname.endswith('.gz'):
- import gzip
- fh = iter(gzip.GzipFile(fname))
- elif fname.endswith('.bz2'):
- import bz2
- fh = iter(bz2.BZ2File(fname))
- elif sys.version_info[0] == 2:
- fh = iter(open(fname, 'U'))
- else:
- fh = iter(open(fname))
else:
fh = iter(fname)
+ fencoding = getattr(fname, 'encoding', 'latin1')
except TypeError:
raise ValueError('fname must be a string, file handle, or generator')
- X = []
+
+ # input may be a Python 2 io stream
+ if encoding is not None:
+ fencoding = encoding
+ # we must assume the local encoding
+ # TODO: emit portability warning?
+ elif fencoding is None:
+ import locale
+ fencoding = locale.getpreferredencoding()
# not to be confused with the flatten_dtype we import...
def flatten_dtype_internal(dt):
@@ -960,21 +980,53 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
return tuple(ret)
def split_line(line):
- """Chop off comments, strip, and split at delimiter.
+ """Chop off comments, strip, and split at delimiter. """
+ line = _decode_line(line, encoding=encoding)
- Note that although the file is opened as text, this function
- returns bytes.
-
- """
- line = asbytes(line)
if comments is not None:
- line = regex_comments.split(asbytes(line), maxsplit=1)[0]
- line = line.strip(b'\r\n')
+ line = regex_comments.split(line, maxsplit=1)[0]
+ line = line.strip('\r\n')
if line:
return line.split(delimiter)
else:
return []
+ def read_data(chunk_size):
+ """Parse each line, including the first.
+
+ The file being read, `fh`, is a global defined above.
+
+ Parameters
+ ----------
+ chunk_size : int
+ At most `chunk_size` lines are read at a time, with iteration
+ until all lines are read.
+
+ """
+ X = []
+ for i, line in enumerate(itertools.chain([first_line], fh)):
+ vals = split_line(line)
+ if len(vals) == 0:
+ continue
+ if usecols:
+ vals = [vals[j] for j in usecols]
+ if len(vals) != N:
+ line_num = i + skiprows + 1
+ raise ValueError("Wrong number of columns at line %d"
+ % line_num)
+
+ # Convert each value according to its column and store
+ items = [conv(val) for (conv, val) in zip(converters, vals)]
+
+ # Then pack it according to the dtype's nesting
+ items = pack_items(items, packing)
+ X.append(items)
+ if len(X) > chunk_size:
+ yield X
+ X = []
+ if X:
+ yield X
+
try:
# Make sure we're dealing with a proper dtype
dtype = np.dtype(dtype)
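The generator-based chunking in `read_data` follows a standard pattern; a standalone sketch of the same idea (names hypothetical):

    def chunked(iterable, chunk_size):
        # collect items and yield them in lists of about chunk_size,
        # so the caller never holds the whole input in memory at once
        chunk = []
        for item in iterable:
            chunk.append(item)
            if len(chunk) >= chunk_size:
                yield chunk
                chunk = []
        if chunk:
            yield chunk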
@@ -1017,30 +1069,42 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
except ValueError:
# Unused converter specified
continue
- converters[i] = conv
-
- # Parse each line, including the first
- for i, line in enumerate(itertools.chain([first_line], fh)):
- vals = split_line(line)
- if len(vals) == 0:
- continue
- if usecols:
- vals = [vals[j] for j in usecols]
- if len(vals) != N:
- line_num = i + skiprows + 1
- raise ValueError("Wrong number of columns at line %d"
- % line_num)
-
- # Convert each value according to its column and store
- items = [conv(val) for (conv, val) in zip(converters, vals)]
- # Then pack it according to the dtype's nesting
- items = pack_items(items, packing)
- X.append(items)
+ if byte_converters:
+ # converters may use decode to work around numpy's old behaviour,
+ # so encode the string again before passing to the user converter
+ def tobytes_first(x, conv):
+ if type(x) is bytes:
+ return conv(x)
+ return conv(x.encode("latin1"))
+ import functools
+ converters[i] = functools.partial(tobytes_first, conv=conv)
+ else:
+ converters[i] = conv
+
+ converters = [conv if conv is not bytes else
+ lambda x: x.encode(fencoding) for conv in converters]
+
+ # read data in chunks and fill it into an array via resize
+ # over-allocating and shrinking the array later may be faster but is
+ # probably not relevant compared to the cost of actually reading and
+ # converting the data
+ X = None
+ for x in read_data(_loadtxt_chunksize):
+ if X is None:
+ X = np.array(x, dtype)
+ else:
+ nshape = list(X.shape)
+ pos = nshape[0]
+ nshape[0] += len(x)
+ X.resize(nshape)
+ X[pos:, ...] = x
finally:
if fown:
fh.close()
- X = np.array(X, dtype)
+ if X is None:
+ X = np.array([], dtype)
+
# Multicolumn data are returned with shape (1, N, M), i.e.
# (1, 1, M) for a single row - remove the singleton dimension there
if X.ndim == 3 and X.shape[:2] == (1, 1):
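The incremental growth via `ndarray.resize` can be shown in isolation; note that in-place resize requires the array to own its memory (it raises if other references to the array exist):

    import numpy as np

    X = None
    for chunk in ([[1.0], [2.0]], [[3.0]]):   # stand-in for read_data()
        if X is None:
            X = np.array(chunk, float)
        else:
            pos = X.shape[0]
            # grow the existing buffer in place instead of reallocating
            X.resize((pos + len(chunk),) + X.shape[1:])
            X[pos:, ...] = chunk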
@@ -1072,7 +1136,7 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='',
- footer='', comments='# '):
+ footer='', comments='# ', encoding=None):
"""
Save an array to a text file.
@@ -1116,6 +1180,13 @@ def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='',
``numpy.loadtxt``.
.. versionadded:: 1.7.0
+ encoding : {None, str}, optional
+ Encoding used to encode the output file. Does not apply to output
+ streams. If the encoding is something other than 'bytes' or 'latin1'
+ you will not be able to load the file in NumPy versions < 1.14. Default
+ is 'latin1'.
+
+ .. versionadded:: 1.14.0
See Also
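A usage sketch of the new parameter (file name hypothetical):

    import numpy as np

    a = np.array([[1.5, 2.5], [3.5, 4.5]])
    # write the file as UTF-8; note the docstring's caveat that NumPy
    # versions before 1.14 cannot read non-latin1 output
    np.savetxt('out.txt', a, fmt='%.2f', header=u'café data',
               encoding='utf-8')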
@@ -1190,21 +1261,53 @@ def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='',
fmt = asstr(fmt)
delimiter = asstr(delimiter)
+ class WriteWrap(object):
+ """Convert to unicode in py2 or to bytes on bytestream inputs.
+
+ """
+ def __init__(self, fh, encoding):
+ self.fh = fh
+ self.encoding = encoding
+ self.do_write = self.first_write
+
+ def close(self):
+ self.fh.close()
+
+ def write(self, v):
+ self.do_write(v)
+
+ def write_bytes(self, v):
+ if isinstance(v, bytes):
+ self.fh.write(v)
+ else:
+ self.fh.write(v.encode(self.encoding))
+
+ def write_normal(self, v):
+ self.fh.write(asunicode(v))
+
+ def first_write(self, v):
+ try:
+ self.write_normal(v)
+ self.write = self.write_normal
+ except TypeError:
+ # input is probably a bytestream
+ self.write_bytes(v)
+ self.write = self.write_bytes
+
own_fh = False
if is_pathlib_path(fname):
fname = str(fname)
if _is_string_like(fname):
+ # datasource doesn't support creating a new file ...
+ open(fname, 'wt').close()
+ fh = np.lib._datasource.open(fname, 'wt', encoding=encoding)
own_fh = True
- if fname.endswith('.gz'):
- import gzip
- fh = gzip.open(fname, 'wb')
- else:
- if sys.version_info[0] >= 3:
- fh = open(fname, 'wb')
- else:
- fh = open(fname, 'w')
+ # need to convert str to unicode for text io output
+ if sys.version_info[0] == 2:
+ fh = WriteWrap(fh, encoding or 'latin1')
elif hasattr(fname, 'write'):
- fh = fname
+ # wrap to handle byte output streams
+ fh = WriteWrap(fname, encoding or 'latin1')
else:
raise ValueError('fname must be a string or file handle')
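The first-write dispatch in `WriteWrap` is a duck-typing trick: try text first, and fall back to encoding if the stream rejects str. The same idea in isolation (names hypothetical):

    import io

    def make_writer(fh, encoding='latin1'):
        # return a write function matching whatever fh accepts
        def write(v):
            try:
                fh.write(v)                    # text stream accepts str
            except TypeError:
                fh.write(v.encode(encoding))   # byte stream wants bytes
        return write

    w = make_writer(io.BytesIO())
    w(u'1 2 3\n')   # transparently encoded to latin1 bytes

Unlike this simplified version, `WriteWrap` caches the outcome of the first write so later writes skip the try/except.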
@@ -1254,31 +1357,33 @@ def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='',
if len(header) > 0:
header = header.replace('\n', '\n' + comments)
- fh.write(asbytes(comments + header + newline))
+ fh.write(comments + header + newline)
if iscomplex_X:
for row in X:
row2 = []
for number in row:
row2.append(number.real)
row2.append(number.imag)
- fh.write(asbytes(format % tuple(row2) + newline))
+ fh.write(format % tuple(row2) + newline)
else:
for row in X:
try:
- fh.write(asbytes(format % tuple(row) + newline))
+ v = format % tuple(row) + newline
except TypeError:
raise TypeError("Mismatch between array dtype ('%s') and "
"format specifier ('%s')"
% (str(X.dtype), format))
+ fh.write(v)
+
if len(footer) > 0:
footer = footer.replace('\n', '\n' + comments)
- fh.write(asbytes(comments + footer + newline))
+ fh.write(comments + footer + newline)
finally:
if own_fh:
fh.close()
-def fromregex(file, regexp, dtype):
+def fromregex(file, regexp, dtype, encoding=None):
"""
Construct an array from a text file, using regular expression parsing.
@@ -1295,6 +1400,10 @@ def fromregex(file, regexp, dtype):
Groups in the regular expression correspond to fields in the dtype.
dtype : dtype or list of dtypes
Dtype for the structured array.
+ encoding : str, optional
+ Encoding used to decode the input file. Does not apply to input streams.
+
+ .. versionadded:: 1.14.0
Returns
-------
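A usage sketch (file name and pattern hypothetical):

    import numpy as np

    # parse lines like "temp 23.5" into a structured array,
    # decoding the file as UTF-8
    out = np.fromregex('readings.txt', r'(\w+)\s+([\d.]+)',
                       [('key', 'U16'), ('value', np.float64)],
                       encoding='utf-8')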
@@ -1335,16 +1444,22 @@ def fromregex(file, regexp, dtype):
"""
own_fh = False
if not hasattr(file, "read"):
- file = open(file, 'rb')
+ file = np.lib._datasource.open(file, 'rt', encoding=encoding)
own_fh = True
try:
- if not hasattr(regexp, 'match'):
- regexp = re.compile(asbytes(regexp))
if not isinstance(dtype, np.dtype):
dtype = np.dtype(dtype)
- seq = regexp.findall(file.read())
+ content = file.read()
+ if isinstance(content, bytes) and not isinstance(regexp, bytes):
+ regexp = asbytes(regexp)
+ elif not isinstance(content, bytes) and isinstance(regexp, bytes):
+ regexp = asstr(regexp)
+
+ if not hasattr(regexp, 'match'):
+ regexp = re.compile(regexp)
+ seq = regexp.findall(content)
if seq and not isinstance(seq[0], tuple):
# Only one group is in the regexp.
# Create the new array as a single data-type and then
@@ -1372,7 +1487,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
names=None, excludelist=None, deletechars=None,
replace_space='_', autostrip=False, case_sensitive=True,
defaultfmt="f%i", unpack=None, usemask=False, loose=True,
- invalid_raise=True, max_rows=None):
+ invalid_raise=True, max_rows=None, encoding='bytes'):
"""
Load data from a text file, with missing values handled as specified.
@@ -1460,6 +1575,15 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
to read the entire file.
.. versionadded:: 1.10.0
+ encoding : str, optional
+ Encoding used to decode the input file. Does not apply when `fname` is
+ a file object. The special value 'bytes' enables backward compatibility
+ workarounds that ensure that you receive byte arrays when possible
+ and pass latin1 encoded strings to converters. Override this value to
+ receive unicode arrays and pass strings as input to converters. If set
+ to None the system default is used. The default value is 'bytes'.
+
+ .. versionadded:: 1.14.0
Returns
-------
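A usage sketch mirroring the loadtxt example (data is illustrative):

    import numpy as np
    from io import StringIO

    data = u"name,age\nnoël,25\nanna,30\n"
    # an explicit encoding yields unicode string fields and skips the
    # backward-compatible latin1 round-trip of the default 'bytes'
    t = np.genfromtxt(StringIO(data), delimiter=',', names=True,
                      dtype=None, encoding='utf-8')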
@@ -1536,15 +1660,6 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
if max_rows < 1:
raise ValueError("'max_rows' must be at least 1.")
- # Py3 data conversions to bytes, for convenience
- if comments is not None:
- comments = asbytes(comments)
- if isinstance(delimiter, unicode):
- delimiter = asbytes(delimiter)
- if isinstance(missing_values, (unicode, list, tuple)):
- missing_values = asbytes_nested(missing_values)
-
- #
if usemask:
from numpy.ma import MaskedArray, make_mask_descr
# Check the input dictionary of converters
@@ -1554,16 +1669,19 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
"The input argument 'converter' should be a valid dictionary "
"(got '%s' instead)" % type(user_converters))
+ if encoding == 'bytes':
+ encoding = None
+ byte_converters = True
+ else:
+ byte_converters = False
+
# Initialize the filehandle, the LineSplitter and the NameValidator
own_fhd = False
try:
if is_pathlib_path(fname):
fname = str(fname)
if isinstance(fname, basestring):
- if sys.version_info[0] == 2:
- fhd = iter(np.lib._datasource.open(fname, 'rbU'))
- else:
- fhd = iter(np.lib._datasource.open(fname, 'rb'))
+ fhd = iter(np.lib._datasource.open(fname, 'rt', encoding=encoding))
own_fhd = True
else:
fhd = iter(fname)
@@ -1573,7 +1691,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
"or generator. Got %s instead." % type(fname))
split_line = LineSplitter(delimiter=delimiter, comments=comments,
- autostrip=autostrip)._handyman
+ autostrip=autostrip, encoding=encoding)
validate_names = NameValidator(excludelist=excludelist,
deletechars=deletechars,
case_sensitive=case_sensitive,
@@ -1587,15 +1705,15 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
first_values = None
try:
while not first_values:
- first_line = next(fhd)
+ first_line = _decode_line(next(fhd), encoding)
if names is True:
if comments in first_line:
first_line = (
- b''.join(first_line.split(comments)[1:]))
+ ''.join(first_line.split(comments)[1:]))
first_values = split_line(first_line)
except StopIteration:
# return an empty array if the datafile is empty
- first_line = b''
+ first_line = ''
first_values = []
warnings.warn('genfromtxt: Empty input file: "%s"' % fname, stacklevel=2)
@@ -1618,9 +1736,8 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
# Check the names and overwrite the dtype.names if needed
if names is True:
- names = validate_names([_bytes_to_name(_.strip())
- for _ in first_values])
- first_line = b''
+ names = validate_names([str(_.strip()) for _ in first_values])
+ first_line = ''
elif _is_string_like(names):
names = validate_names([_.strip() for _ in names.split(',')])
elif names:
@@ -1657,9 +1774,11 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
# Process the missing values ...............................
# Rename missing_values for convenience
user_missing_values = missing_values or ()
+ if isinstance(user_missing_values, bytes):
+ user_missing_values = user_missing_values.decode('latin1')
# Define the list of missing_values (one column: one list)
- missing_values = [list([b'']) for _ in range(nbcols)]
+ missing_values = [list(['']) for _ in range(nbcols)]
# We have a dictionary: process it field by field
if isinstance(user_missing_values, dict):
@@ -1698,8 +1817,8 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
if value not in entry:
entry.append(value)
# We have a string : apply it to all entries
- elif isinstance(user_missing_values, bytes):
- user_value = user_missing_values.split(b",")
+ elif isinstance(user_missing_values, basestring):
+ user_value = user_missing_values.split(",")
for entry in missing_values:
entry.extend(user_value)
# We have something else: apply it to all entries
@@ -1787,16 +1906,29 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
testing_value = first_values[j]
else:
testing_value = None
- converters[i].update(conv, locked=True,
+ if conv is bytes:
+ user_conv = asbytes
+ elif byte_converters:
+ # converters may use decode to work around numpy's old behaviour,
+ # so encode the string again before passing to the user converter
+ def tobytes_first(x, conv):
+ if type(x) is bytes:
+ return conv(x)
+ return conv(x.encode("latin1"))
+ import functools
+ user_conv = functools.partial(tobytes_first, conv=conv)
+ else:
+ user_conv = conv
+ converters[i].update(user_conv, locked=True,
testing_value=testing_value,
default=filling_values[i],
missing_values=missing_values[i],)
- uc_update.append((i, conv))
+ uc_update.append((i, user_conv))
# Make sure we have the corrected keys in user_converters...
user_converters.update(uc_update)
# Fixme: possible error as following variable never used.
- #miss_chars = [_.missing_values for _ in converters]
+ # miss_chars = [_.missing_values for _ in converters]
# Initialize the output lists ...
# ... rows
@@ -1908,16 +2040,42 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
column_types = [conv.type for conv in converters]
# Find the columns with strings...
strcolidx = [i for (i, v) in enumerate(column_types)
- if v in (type('S'), np.string_)]
+ if v == np.unicode_]
+
+ type_str = np.unicode_
+ if byte_converters and strcolidx:
+ # convert strings back to bytes for backward compatibility
+ warnings.warn(
+ "Reading unicode strings without specifying the encoding "
+ "argument is deprecated. Set the encoding, use None for the "
+ "system default.",
+ np.VisibleDeprecationWarning, stacklevel=2)
+ def encode_unicode_cols(row_tup):
+ row = list(row_tup)
+ for i in strcolidx:
+ row[i] = row[i].encode('latin1')
+ return tuple(row)
+
+ try:
+ data = [encode_unicode_cols(r) for r in data]
+ type_str = np.bytes_
+ except UnicodeEncodeError:
+ pass
+
# ... and take the largest number of chars.
for i in strcolidx:
- column_types[i] = "|S%i" % max(len(row[i]) for row in data)
+ max_line_length = max(len(row[i]) for row in data)
+ column_types[i] = np.dtype((type_str, max_line_length))
#
if names is None:
# If the dtype is uniform, don't define names, else use ''
base = set([c.type for c in converters if c._checked])
if len(base) == 1:
- (ddtype, mdtype) = (list(base)[0], bool)
+ if strcolidx:
+ (ddtype, mdtype) = (type_str, bool)
+ else:
+ (ddtype, mdtype) = (list(base)[0], bool)
else:
ddtype = [(defaultfmt % i, dt)
for (i, dt) in enumerate(column_types)]
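In practice this means string columns read without an explicit encoding now emit a VisibleDeprecationWarning and come back as latin1 bytes where possible, while any explicit encoding opts into unicode arrays:

    import numpy as np
    from io import StringIO

    # legacy path: warns, string column has a bytes (np.bytes_) dtype
    a = np.genfromtxt(StringIO(u"a 1\nb 2\n"), dtype=None)

    # explicit encoding: no warning, string column is unicode (str)
    b = np.genfromtxt(StringIO(u"a 1\nb 2\n"), dtype=None, encoding='utf-8')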
@@ -1966,8 +2124,8 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
# Keep the dtype of the current converter
if i in user_converters:
ishomogeneous &= (ttype == dtype.type)
- if ttype == np.string_:
- ttype = "|S%i" % max(len(row[i]) for row in data)
+ if np.issubdtype(ttype, np.character):
+ ttype = (ttype, max(len(row[i]) for row in data))
descr.append(('', ttype))
else:
descr.append(('', dtype))
@@ -1990,9 +2148,9 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
# Try to take care of the missing data we missed
names = output.dtype.names
if usemask and names:
- for (name, conv) in zip(names or (), converters):
+ for (name, conv) in zip(names, converters):
missing_values = [conv(_) for _ in conv.missing_values
- if _ != b'']
+ if _ != '']
for mval in missing_values:
outputmask[name] |= (output[name] == mval)
# Construct the final array