diff options
author | Hernan Grecco <hernan.grecco@gmail.com> | 2014-07-08 11:43:08 -0300 |
---|---|---|
committer | Hernan Grecco <hernan.grecco@gmail.com> | 2014-07-08 11:43:08 -0300 |
commit | c7ae27ad8554479b8f8dcee900cb6a8a3c5ebb4a (patch) | |
tree | 462237134719ead2e1667dc7d9df3c80c0790e4d | |
parent | b683a79f7e1d0a70482e2395018b2ef5e538388f (diff) | |
download | pint-_npsubclass.tar.gz |
Added arrayterator_npsubclass
-rw-r--r-- | pint/helpers/__init__.py | 3 | ||||
-rw-r--r-- | pint/helpers/arrayterator.py | 225 |
2 files changed, 228 insertions, 0 deletions
# pint/helpers/arrayterator.py
# (re-exported by pint/helpers/__init__.py via
#  ``from .arrayterator import Arrayterator``)
"""
A buffered iterator for big arrays.

This module solves the problem of iterating over a big file-based array
without having to read it into memory. The `Arrayterator` class wraps
an array object, and when iterated it will return sub-arrays with at most
a user-specified number of elements.

"""
import sys
from functools import reduce
from numbers import Integral
from operator import mul

__all__ = ['Arrayterator']


class Arrayterator(object):
    """
    Buffered iterator for big arrays, with a multiplier applied to the data.

    `Arrayterator` creates a buffered iterator for reading big arrays in small
    contiguous blocks. The class is useful for objects stored in the
    file system. It allows iteration over the object *without* reading
    everything in memory; instead, small blocks are read and iterated over.

    `Arrayterator` can be used with any object that supports multidimensional
    slices and exposes a ``shape``. This includes NumPy arrays, but also
    variables from Scientific.IO.NetCDF or pynetcdf for example.

    Parameters
    ----------
    var : array_like
        The object to iterate over. Must expose ``shape`` and accept a tuple
        of slices as index.
    units : object
        Factor every block read from `var` is multiplied by before it is
        handed out (presumably a pint unit or quantity -- confirm against
        callers; any object supporting ``block * units`` works).
    buf_size : int, optional
        The buffer size. If `buf_size` is supplied, the maximum amount of
        data that will be read into memory is `buf_size` elements.
        Default is None, which will read as many elements as possible
        into memory (``0`` is treated like None).

    Attributes
    ----------
    var
    units
    buf_size
    start
    stop
    step
    shape
    flat

    Any other attribute is looked up on `var`.

    See Also
    --------
    ndenumerate : Multidimensional array iterator.
    flatiter : Flat array iterator.
    memmap : Create a memory-map to an array stored in a binary file on disk.

    Notes
    -----
    The algorithm works by first finding a "running dimension", along which
    the blocks will be extracted. Given an array of dimensions
    ``(d1, d2, ..., dn)``, e.g. if `buf_size` is smaller than ``dn``, the
    last dimension will be used. If, on the other hand,
    ``dn < buf_size < dn*d(n-1)`` the second to last dimension will be used,
    and so on. Blocks are extracted along this dimension, and when the last
    block is returned the process continues from the next dimension, until
    all elements have been read.

    Indexing returns a new `Arrayterator` restricted to the selection; no
    data is read at that point. Integer indices do not drop the axis (they
    select a length-one slice) and only positive slice steps are supported.

    Examples
    --------
    >>> import numpy as np
    >>> a = np.arange(3 * 4 * 5 * 6).reshape(3, 4, 5, 6)
    >>> a_itor = Arrayterator(a, 1, buf_size=2)
    >>> a_itor.shape
    (3, 4, 5, 6)

    Now we can iterate over ``a_itor``, and it will return arrays of size
    two. Since `buf_size` is smaller than the last dimension, blocks are
    built along the last dimension:

    >>> for subarr in a_itor:
    ...     if not subarr.all():
    ...         print(subarr, subarr.shape)
    ...
    [[[[0 1]]]] (1, 1, 1, 2)

    """

    def __init__(self, var, units, buf_size=None):
        self.var = var
        self.units = units
        self.buf_size = buf_size

        # Per-axis selection, expressed in coordinates of `var`.
        self.start = [0 for dim in var.shape]
        self.stop = [dim for dim in var.shape]
        self.step = [1 for dim in var.shape]

    def __getattr__(self, attr):
        """
        Delegate unknown attributes to the wrapped object.

        Only called when normal lookup fails. If ``var`` itself is missing
        (e.g. on an instance created without ``__init__``), raise
        `AttributeError` instead of recursing forever through ``self.var``.

        """
        if attr == 'var':
            raise AttributeError(attr)
        return getattr(self.var, attr)

    def __getitem__(self, index):
        """
        Return a new arrayterator restricted to `index`.

        Parameters
        ----------
        index : int, slice, Ellipsis or tuple of those
            Selection relative to the current view (i.e. relative to
            ``self.shape``). Integers, including NumPy integer scalars and
            negative values, select a length-one slice; the axis is kept.
            An out-of-range integer yields an empty selection. Missing
            trailing axes are selected completely.

        Returns
        -------
        Arrayterator
            A new instance of the same class sharing `var`, `units` and
            `buf_size`.

        Raises
        ------
        ValueError
            If a slice has a zero or negative step.

        """
        if not isinstance(index, tuple):
            index = (index,)
        shape = self.shape
        dims = len(shape)

        # Fix index, handling ellipsis, integers and incomplete slices.
        fixed = []
        length = len(index)
        for item in index:
            if item is Ellipsis:
                fixed.extend([slice(None)] * (dims - length + 1))
                length = len(fixed)
            elif isinstance(item, Integral):
                axis = len(fixed)
                item = int(item)
                if axis < dims:
                    extent = shape[axis]
                    if item < 0:
                        item += extent
                    if not 0 <= item < extent:
                        # Out of range: map to an empty slice rather than
                        # wrapping around to an unrelated element.
                        item = extent
                fixed.append(slice(item, item + 1, 1))
            else:
                fixed.append(item)
        if len(fixed) < dims:
            fixed.extend([slice(None)] * (dims - len(fixed)))

        # Return a new arrayterator object.
        out = self.__class__(self.var, self.units, self.buf_size)
        for i, (start, stop, step, slice_, extent) in enumerate(
                zip(self.start, self.stop, self.step, fixed, shape)):
            # Normalise None / negative / out-of-range bounds against the
            # extent of the current view along this axis.
            first, last, stride = slice_.indices(extent)
            if stride < 1:
                raise ValueError('Arrayterator only supports positive '
                                 'slice steps')
            last = max(first, last)
            # Translate view coordinates into coordinates of `var`,
            # honouring the step this view already has.
            out.start[i] = min(stop, start + first * step)
            out.stop[i] = min(stop, start + last * step)
            out.step[i] = step * stride
        return out

    def __array__(self):
        """
        Return the selected data, read in one go and multiplied by `units`.

        """
        slice_ = tuple(slice(*t) for t in zip(
            self.start, self.stop, self.step))
        return self.var[slice_] * self.units

    @property
    def flat(self):
        """
        A 1-D flat iterator for Arrayterator objects.

        This iterator returns the elements of the selection one by one, each
        multiplied by `units` exactly once. Data is still read block by
        block, so at most `buf_size` elements are held at a time. It is
        similar to `flatiter`.

        See Also
        --------
        `Arrayterator`
        flatiter

        Examples
        --------
        >>> import numpy as np
        >>> [int(v) for v in Arrayterator(np.arange(4), 2).flat]
        [0, 2, 4, 6]

        """
        units = self.units
        # Iterate the raw blocks: going through __iter__ would apply `units`
        # to the block and then a second time to every element.
        for block in self._iter_blocks():
            for value in block.flat:
                yield value * units

    @property
    def shape(self):
        """
        The shape of the selection to be iterated over.

        Empty selections report an extent of 0 along the affected axis.
        For an example, see `Arrayterator`.

        """
        return tuple(max(0, (stop - start - 1) // step + 1)
                     for start, stop, step in
                     zip(self.start, self.stop, self.step))

    def __iter__(self):
        """
        Yield consecutive blocks of the selection, multiplied by `units`.

        Each block holds at most `buf_size` elements (everything at once if
        `buf_size` is None or 0). Nothing is yielded for empty selections.

        """
        for block in self._iter_blocks():
            yield block * self.units

    def _iter_blocks(self):
        """Yield the raw blocks ``var[...]`` of the selection, without units."""
        buf_size = self.buf_size
        if buf_size is not None and buf_size < 0:
            # A negative count would move the read position backwards
            # forever.
            raise ValueError('buf_size must be non-negative')

        shape = self.shape
        # Skip arrays with degenerate dimensions. A plain ``return`` ends the
        # generator; ``raise StopIteration`` inside a generator is turned
        # into RuntimeError by PEP 479.
        if any(dim <= 0 for dim in shape):
            return

        ndims = len(shape)
        if ndims == 0:
            # Zero-dimensional data: a single element, nothing to split.
            yield self.var[()]
            return

        start = list(self.start)
        stop = list(self.stop)
        step = list(self.step)
        budget = buf_size or reduce(mul, shape)

        while True:
            count = budget

            # iterate over each dimension, looking for the
            # running dimension (ie, the dimension along which
            # the blocks will be built from)
            rundim = 0
            for i in range(ndims - 1, -1, -1):
                # if count is zero we ran out of elements to read
                # along higher dimensions, so we read only a single position
                if count == 0:
                    stop[i] = start[i] + 1
                elif count <= shape[i]:  # limit along this dimension
                    stop[i] = start[i] + count * step[i]
                    rundim = i
                else:
                    # read everything along this dimension
                    stop[i] = self.stop[i]
                stop[i] = min(self.stop[i], stop[i])
                count = count // shape[i]

            # yield a block
            yield self.var[tuple(slice(*t) for t in zip(start, stop, step))]

            # Update start position, taking care of overflow to
            # other dimensions
            start[rundim] = stop[rundim]  # start where we stopped
            for i in range(ndims - 1, 0, -1):
                if start[i] >= self.stop[i]:
                    start[i] = self.start[i]
                    start[i - 1] += self.step[i - 1]
            if start[0] >= self.stop[0]:
                return