diff options
author | pierregm <pierregm@localhost> | 2009-01-19 21:22:52 +0000 |
---|---|---|
committer | pierregm <pierregm@localhost> | 2009-01-19 21:22:52 +0000 |
commit | 8bd6c70d47e16fd81c8e3aefd4b2ec6dd90f38d6 (patch) | |
tree | d3042373968e32f0daffa3cdef6049d7c34b029a /numpy/lib/_iotools.py | |
parent | 065626fa64df3eb51f6cb5eafcb466818ebb621d (diff) | |
download | numpy-8bd6c70d47e16fd81c8e3aefd4b2ec6dd90f38d6.tar.gz |
* lib : introduced _iotools
* lib.io : introduced genfromtxt, ndfromtxt, mafromtxt, recfromtxt, recfromcsv.
Diffstat (limited to 'numpy/lib/_iotools.py')
-rw-r--r-- | numpy/lib/_iotools.py | 469 |
1 files changed, 469 insertions, 0 deletions
diff --git a/numpy/lib/_iotools.py b/numpy/lib/_iotools.py new file mode 100644 index 000000000..3f3c6655e --- /dev/null +++ b/numpy/lib/_iotools.py @@ -0,0 +1,469 @@ +""" +A collection of functions designed to help I/O with ascii file. + +""" +__docformat__ = "restructuredtext en" + +import numpy as np +import numpy.core.numeric as nx +from __builtin__ import bool, int, long, float, complex, object, unicode, str + + +def _is_string_like(obj): + """ + Check whether obj behaves like a string. + """ + try: + obj + '' + except (TypeError, ValueError): + return False + return True + + +def _to_filehandle(fname, flag='r', return_opened=False): + """ + Returns the filehandle corresponding to a string or a file. + If the string ends in '.gz', the file is automatically unzipped. + + Parameters + ---------- + fname : string, filehandle + Name of the file whose filehandle must be returned. + flag : string, optional + Flag indicating the status of the file ('r' for read, 'w' for write). + return_opened : boolean, optional + Whether to return the opening status of the file. + """ + if _is_string_like(fname): + if fname.endswith('.gz'): + import gzip + fhd = gzip.open(fname, flag) + elif fname.endswith('.bz2'): + import bz2 + fhd = bz2.BZ2File(fname) + else: + fhd = file(fname, flag) + opened = True + elif hasattr(fname, 'seek'): + fhd = fname + opened = False + else: + raise ValueError('fname must be a string or file handle') + if return_opened: + return fhd, opened + return fhd + + +def flatten_dtype(ndtype): + """ + Unpack a structured data-type. + + """ + names = ndtype.names + if names is None: + return [ndtype] + else: + types = [] + for field in names: + (typ, _) = ndtype.fields[field] + flat_dt = flatten_dtype(typ) + types.extend(flat_dt) + return types + + + +class LineSplitter: + """ + Defines a function to split a string at a given delimiter or at given places. + + Parameters + ---------- + comment : {'#', string} + Character used to mark the beginning of a comment. + delimiter : var, optional + If a string, character used to delimit consecutive fields. + If an integer or a sequence of integers, width(s) of each field. + autostrip : boolean, optional + Whether to strip each individual fields + """ + + def autostrip(self, method): + "Wrapper to strip each member of the output of `method`." + return lambda input: [_.strip() for _ in method(input)] + # + def __init__(self, delimiter=None, comments='#', autostrip=True): + self.comments = comments + # Delimiter is a character + if (delimiter is None) or _is_string_like(delimiter): + delimiter = delimiter or None + _handyman = self._delimited_splitter + # Delimiter is a list of field widths + elif hasattr(delimiter, '__iter__'): + _handyman = self._variablewidth_splitter + idx = np.cumsum([0]+list(delimiter)) + delimiter = [slice(i,j) for (i,j) in zip(idx[:-1], idx[1:])] + # Delimiter is a single integer + elif int(delimiter): + (_handyman, delimiter) = (self._fixedwidth_splitter, int(delimiter)) + else: + (_handyman, delimiter) = (self._delimited_splitter, None) + self.delimiter = delimiter + if autostrip: + self._handyman = self.autostrip(_handyman) + else: + self._handyman = _handyman + # + def _delimited_splitter(self, line): + line = line.split(self.comments)[0].strip() + if not line: + return [] + return line.split(self.delimiter) + # + def _fixedwidth_splitter(self, line): + line = line.split(self.comments)[0] + if not line: + return [] + fixed = self.delimiter + slices = [slice(i, i+fixed) for i in range(len(line))[::fixed]] + return [line[s] for s in slices] + # + def _variablewidth_splitter(self, line): + line = line.split(self.comments)[0] + if not line: + return [] + slices = self.delimiter + return [line[s] for s in slices] + # + def __call__(self, line): + return self._handyman(line) + + + +class NameValidator: + """ + Validates a list of strings to use as field names. + The strings are stripped of any non alphanumeric character, and spaces + are replaced by `_`. If the optional input parameter `case_sensitive` + is False, the strings are set to upper case. + + During instantiation, the user can define a list of names to exclude, as + well as a list of invalid characters. Names in the exclusion list + are appended a '_' character. + + Once an instance has been created, it can be called with a list of names + and a list of valid names will be created. + The `__call__` method accepts an optional keyword, `default`, that sets + the default name in case of ambiguity. By default, `default = 'f'`, so + that names will default to `f0`, `f1` + + Parameters + ---------- + excludelist : sequence, optional + A list of names to exclude. This list is appended to the default list + ['return','file','print']. Excluded names are appended an underscore: + for example, `file` would become `file_`. + deletechars : string, optional + A string combining invalid characters that must be deleted from the names. + casesensitive : {True, False, 'upper', 'lower'}, optional + If True, field names are case_sensitive. + If False or 'upper', field names are converted to upper case. + If 'lower', field names are converted to lower case. + """ + # + defaultexcludelist = ['return','file','print'] + defaultdeletechars = set("""~!@#$%^&*()-=+~\|]}[{';: /?.>,<""") + # + def __init__(self, excludelist=None, deletechars=None, case_sensitive=None): + # + if excludelist is None: + excludelist = [] + excludelist.extend(self.defaultexcludelist) + self.excludelist = excludelist + # + if deletechars is None: + delete = self.defaultdeletechars + else: + delete = set(deletechars) + delete.add('"') + self.deletechars = delete + + if (case_sensitive is None) or (case_sensitive is True): + self.case_converter = lambda x: x + elif (case_sensitive is False) or ('u' in case_sensitive): + self.case_converter = lambda x: x.upper() + elif 'l' in case_sensitive: + self.case_converter = lambda x: x.lower() + else: + self.case_converter = lambda x: x + # + def validate(self, names, default='f'): + # + if names is None: + return + # + validatednames = [] + seen = dict() + # + deletechars = self.deletechars + excludelist = self.excludelist + # + case_converter = self.case_converter + # + for i, item in enumerate(names): + item = case_converter(item) + item = item.strip().replace(' ', '_') + item = ''.join([c for c in item if c not in deletechars]) + if not len(item): + item = '%s%d' % (default, i) + elif item in excludelist: + item += '_' + cnt = seen.get(item, 0) + if cnt > 0: + validatednames.append(item + '_%d' % cnt) + else: + validatednames.append(item) + seen[item] = cnt+1 + return validatednames + # + def __call__(self, names, default='f'): + return self.validate(names, default) + + + +def str2bool(value): + """ + Tries to transform a string supposed to represent a boolean to a boolean. + + Raises + ------ + ValueError + If the string is not 'True' or 'False' (case independent) + """ + value = value.upper() + if value == 'TRUE': + return True + elif value == 'FALSE': + return False + else: + raise ValueError("Invalid boolean") + + + +class StringConverter: + """ + Factory class for function transforming a string into another object (int, + float). + + After initialization, an instance can be called to transform a string + into another object. If the string is recognized as representing a missing + value, a default value is returned. + + Parameters + ---------- + dtype_or_func : {None, dtype, function}, optional + Input data type, used to define a basic function and a default value + for missing data. For example, when `dtype` is float, the :attr:`func` + attribute is set to ``float`` and the default value to `np.nan`. + Alternatively, function used to convert a string to another object. + In that later case, it is recommended to give an associated default + value as input. + default : {None, var}, optional + Value to return by default, that is, when the string to be converted + is flagged as missing. + missing_values : {sequence}, optional + Sequence of strings indicating a missing value. + locked : {boolean}, optional + Whether the StringConverter should be locked to prevent automatic + upgrade or not. + + Attributes + ---------- + func : function + Function used for the conversion + default : var + Default value to return when the input corresponds to a missing value. + type : type + Type of the output. + _status : integer + Integer representing the order of the conversion. + _mapper : sequence of tuples + Sequence of tuples (dtype, function, default value) to evaluate in order. + _locked : boolean + Whether the StringConverter is locked, thereby preventing automatic any + upgrade or not. + + """ + # + _mapper = [(nx.bool_, str2bool, None), + (nx.integer, int, -1), + (nx.floating, float, nx.nan), + (complex, complex, nx.nan+0j), + (nx.string_, str, '???')] + (_defaulttype, _defaultfunc, _defaultfill) = zip(*_mapper) + # + @classmethod + def _getsubdtype(cls, val): + """Returns the type of the dtype of the input variable.""" + return np.array(val).dtype.type + # + @classmethod + def upgrade_mapper(cls, func, default=None): + """ + Upgrade the mapper of a StringConverter by adding a new function and its + corresponding default. + + The input function (or sequence of functions) and its associated default + value (if any) is inserted in penultimate position of the mapper. + The corresponding type is estimated from the dtype of the default value. + + Parameters + ---------- + func : var + Function, or sequence of functions + + Examples + -------- + >>> import dateutil.parser + >>> import datetime + >>> dateparser = datetutil.parser.parse + >>> defaultdate = datetime.date(2000, 1, 1) + >>> StringConverter.upgrade_mapper(dateparser, default=defaultdate) + """ + # Func is a single functions + if hasattr(func, '__call__'): + cls._mapper.insert(-1, (cls._getsubdtype(default), func, default)) + return + elif hasattr(func, '__iter__'): + if isinstance(func[0], (tuple, list)): + for _ in func: + cls._mapper.insert(-1, _) + return + if default is None: + default = [None] * len(func) + else: + default = list(default) + default.append([None] * (len(func)-len(default))) + for (fct, dft) in zip(func, default): + cls._mapper.insert(-1, (cls._getsubdtype(dft), fct, dft)) + # + def __init__(self, dtype_or_func=None, default=None, missing_values=None, + locked=False): + # Defines a lock for upgrade + self._locked = bool(locked) + # No input dtype: minimal initialization + if dtype_or_func is None: + self.func = str2bool + self._status = 0 + self.default = default + ttype = np.bool + else: + # Is the input a np.dtype ? + try: + self.func = None + ttype = np.dtype(dtype_or_func).type + except TypeError: + # dtype_or_func must be a function, then + if not hasattr(dtype_or_func, '__call__'): + errmsg = "The input argument `dtype` is neither a function"\ + " or a dtype (got '%s' instead)" + raise TypeError(errmsg % type(dtype_or_func)) + # Set the function + self.func = dtype_or_func + # If we don't have a default, try to guess it or set it to None + if default is None: + try: + default = self.func('0') + except ValueError: + default = None + ttype = self._getsubdtype(default) + # Set the status according to the dtype + for (i, (deftype, func, default_def)) in enumerate(self._mapper): + if np.issubdtype(ttype, deftype): + self._status = i + self.default = default or default_def + break + # If the input was a dtype, set the function to the last we saw + if self.func is None: + self.func = func + # If the status is 1 (int), change the function to smthg more robust + if self.func == self._mapper[1][1]: + self.func = lambda x : int(float(x)) + # Store the list of strings corresponding to missing values. + if missing_values is None: + self.missing_values = set(['']) + else: + self.missing_values = set(list(missing_values) + ['']) + # + self._callingfunction = self._strict_call + self.type = ttype + # + def _loose_call(self, value): + try: + return self.func(value) + except ValueError: + return self.default + # + def _strict_call(self, value): + try: + return self.func(value) + except ValueError: + if value.strip() in self.missing_values: + return self.default + raise ValueError("Cannot convert string '%s'" % value) + # + def __call__(self, value): + return self._callingfunction(value) + # + def upgrade(self, value): + """ + Tries to find the best converter for `value`, by testing different + converters in order. + The order in which the converters are tested is read from the + :attr:`_status` attribute of the instance. + """ + try: + self._strict_call(value) + except ValueError: + # Raise an exception if we locked the converter... + if self._locked: + raise ValueError("Converter is locked and cannot be upgraded") + _statusmax = len(self._mapper) + # Complains if we try to upgrade by the maximum + if self._status == _statusmax: + raise ValueError("Could not find a valid conversion function") + elif self._status < _statusmax - 1: + self._status += 1 + (self.type, self.func, self.default) = self._mapper[self._status] + self.upgrade(value) + # + def update(self, func, default=None, missing_values='', locked=False): + """ + Sets the :attr:`func` and :attr:`default` attributes directly. + + Parameters + ---------- + func : function + Conversion function. + default : {var}, optional + Default value to return when a missing value is encountered. + missing_values : {var}, optional + Sequence of strings representing missing values. + locked : {False, True}, optional + Whether the status should be locked to prevent automatic upgrade. + """ + self.func = func + self._locked = locked + # Don't reset the default to None if we can avoid it + if default is not None: + self.default = default + # Add the missing values to the existing set + if missing_values is not None: + if _is_string_like(missing_values): + self.missing_values.add(missing_values) + elif hasattr(missing_values, '__iter__'): + for val in missing_values: + self.missing_values.add(val) + else: + self.missing_values = [] # Update the type + self.type = self._getsubdtype(func('0')) + |