ENH: missingdata: Create count_reduce_items function

This function either cheaply returns the product of the sizes of all the reduction axes, or counts the number of items which will be used in a reduction operation when skipna is True. Its purpose is to make it easy to do functions like np.mean and np.std.
author: Mark Wiebe <mwwiebe@gmail.com> 2011-08-17 19:18:56 -0700
committer: Charles Harris <charlesr.harris@gmail.com> 2011-08-27 07:26:56 -0600
commit: bfda229ec93d37b1ee2cdd8b9443ec4e34536bbf (patch)
tree: 1315cce82b32c73eb9390e3fa46d8a8e0ddecebe
parent: d9b3f90de3213ece9a78b77088fdec17910e81d9 (diff)
download: numpy-bfda229ec93d37b1ee2cdd8b9443ec4e34536bbf.tar.gz
9 files changed, 339 insertions, 12 deletions
diff --git a/doc/source/reference/routines.sort.rst b/doc/source/reference/routines.sort.rst
index c10252c69..517ea5897 100644
--- a/doc/source/reference/routines.sort.rst
+++ b/doc/source/reference/routines.sort.rst
@@ -37,3 +37,4 @@ Counting
    :toctree: generated/
 
    count_nonzero
+   count_reduce_items
diff --git a/numpy/add_newdocs.py b/numpy/add_newdocs.py
index 82f59c6b0..8596b9c9c 100644
--- a/numpy/add_newdocs.py
+++ b/numpy/add_newdocs.py
@@ -915,6 +915,56 @@ add_newdoc('numpy.core.multiarray', 'count_nonzero',
     5
     """)
 
+add_newdoc('numpy.core.multiarray', 'count_reduce_items',
+    """
+    count_reduce_items(arr, axis=None, skipna=False)
+
+    Counts the number of items a reduction with the same `axis`
+    and `skipna` parameter values would use. The purpose of this
+    function is for the creation of reduction operations
+    which use the item count, such as :func:`mean`.
+
+    When `skipna` is False or `arr` doesn't have an NA mask,
+    the result is simply the product of the reduction axis
+    sizes, returned as a single scalar.
+
+    Parameters
+    ----------
+    arr : array_like
+        The array for which to count the reduce items.
+    axis : None or int or tuple of ints, optional
+        Axis or axes along which a reduction is performed.
+        The default (`axis` = None) is perform a reduction over all
+        the dimensions of the input array.
+    skipna : bool, optional
+        If this is set to True, any NA elements in the array are not
+        counted. The only time this function does any actual counting
+        instead of a cheap multiply of a few sizes is when `skipna` is
+        true and `arr` has an NA mask.
+
+    Returns
+    -------
+    count : intp or array of intp
+        Number of items that would be used in a reduction with the
+        same `axis` and `skipna` parameter values.
+
+    Examples
+    --------
+    >>> a = np.array([[1,np.NA,1], [1,1,np.NA]])
+
+    >>> np.count_reduce_items(a)
+    6
+    >>> np.count_reduce_items(a, skipna=True)
+    4
+    >>> np.sum(a, skipna=True)
+    4
+
+    >>> np.count_reduce_items(a, axis=0, skipna=True)
+    array([2, 1, 1])
+    >>> np.sum(a, axis=0, skipna=True)
+    array([2, 1, 1])
+    """)
+
 add_newdoc('numpy.core.multiarray','set_typeDict',
     """set_typeDict(dict)
 
@@ -5280,7 +5330,7 @@ add_newdoc('numpy.core', 'ufunc', ('types',
 
 add_newdoc('numpy.core', 'ufunc', ('reduce',
     """
-    reduce(a, axis=0, dtype=None, out=None)
+    reduce(a, axis=0, dtype=None, out=None, skipna=False)
 
     Reduces `a`'s dimension by one, by applying ufunc along one axis.
 
@@ -5325,6 +5375,11 @@ add_newdoc('numpy.core', 'ufunc', ('reduce',
     out : ndarray, optional
         A location into which the result is stored. If not provided, a
         freshly-allocated array is returned.
+    skipna : bool, optional
+        If this is set to True, the reduction is done as if any NA elements
+        were not counted in the array. The default, False, causes the
+        NA values to propagate, so if any element in a set of elements
+        being reduced is NA, the result will be NA.
 
     Returns
     -------
diff --git a/numpy/core/code_generators/numpy_api.py b/numpy/core/code_generators/numpy_api.py
index f963fa783..f2470a58f 100644
--- a/numpy/core/code_generators/numpy_api.py
+++ b/numpy/core/code_generators/numpy_api.py
@@ -334,7 +334,7 @@ multiarray_funcs_api = {
     'PyArray_AssignArray':                  295,
     'PyArray_CreateReduceResult':           296,
     'PyArray_InitializeReduceResult':       297,
-    'PyArray_RemoveAxesInPlace':            298
+    'PyArray_RemoveAxesInPlace':            298,
 }
 
 ufunc_types_api = {
diff --git a/numpy/core/numeric.py b/numpy/core/numeric.py
index 9b7c20d76..00184505d 100644
--- a/numpy/core/numeric.py
+++ b/numpy/core/numeric.py
@@ -1,7 +1,7 @@
 __all__ = ['newaxis', 'ndarray', 'flatiter', 'nditer', 'nested_iters', 'ufunc',
-           'arange', 'array', 'zeros', 'count_nonzero', 'empty', 'broadcast',
-           'dtype', 'fromstring', 'fromfile', 'frombuffer',
-           'int_asbuffer', 'where', 'argwhere', 'copyto',
+           'arange', 'array', 'zeros', 'count_nonzero', 'count_reduce_items',
+           'empty', 'broadcast', 'dtype', 'fromstring', 'fromfile',
+           'frombuffer', 'int_asbuffer', 'where', 'argwhere', 'copyto',
            'concatenate', 'fastCopyAndTranspose', 'lexsort', 'set_numeric_ops',
            'can_cast', 'promote_types', 'min_scalar_type', 'result_type',
            'asarray', 'asanyarray', 'ascontiguousarray', 'asfortranarray',
@@ -144,6 +144,7 @@ arange = multiarray.arange
 array = multiarray.array
 zeros = multiarray.zeros
 count_nonzero = multiarray.count_nonzero
+count_reduce_items = multiarray.count_reduce_items
 empty = multiarray.empty
 empty_like = multiarray.empty_like
 fromstring = multiarray.fromstring
diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c
index 88b81f43d..5257aec8e 100644
--- a/numpy/core/src/multiarray/item_selection.c
+++ b/numpy/core/src/multiarray/item_selection.c
@@ -2009,7 +2009,7 @@ PyArray_ReduceCountNonzero(PyArrayObject *arr, PyArrayObject *out,
                             &assign_reduce_unit_zero,
                             &reduce_count_nonzero_inner_loop,
                             &reduce_count_nonzero_masked_inner_loop,
-                            nonzero, "count_nonzero");
+                            nonzero, 0, "count_nonzero");
     Py_DECREF(dtype);
     if (out == NULL && result != NULL) {
         return PyArray_Return(result);
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index e3cb3709e..e21369e6f 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -54,6 +54,7 @@ NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0;
 #include "ctors.h"
 #include "na_singleton.h"
 #include "na_mask.h"
+#include "reduction.h"
 
 /* Only here for API compatibility */
 NPY_NO_EXPORT PyTypeObject PyBigArray_Type;
@@ -2211,6 +2212,44 @@ array_count_nonzero(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds)
 }
 
 static PyObject *
+array_count_reduce_items(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds)
+{
+    static char *kwlist[] = {"arr", "axis", "skipna", NULL};
+
+    PyObject *array_in, *axis_in = NULL;
+    PyObject *ret = NULL;
+    PyArrayObject *array;
+    npy_bool axis_flags[NPY_MAXDIMS];
+    int skipna = 0;
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwds,
+                                "O|Oi:count_reduce_items", kwlist,
+                                &array_in,
+                                &axis_in,
+                                &skipna)) {
+        return NULL;
+    }
+
+    array = (PyArrayObject *)PyArray_FromAny(array_in, NULL,
+                                        0, 0, NPY_ARRAY_ALLOWNA, NULL);
+    if (array == NULL) {
+        return NULL;
+    }
+
+    if (PyArray_ConvertMultiAxis(axis_in, PyArray_NDIM(array),
+                                        axis_flags) != NPY_SUCCEED) {
+        Py_DECREF(array);
+        return NULL;
+    }
+
+    ret = PyArray_CountReduceItems(array, axis_flags, skipna);
+
+    Py_DECREF(array);
+
+    return ret;
+}
+
+static PyObject *
 array_fromstring(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *keywds)
 {
     char *data;
@@ -3756,6 +3795,9 @@ static struct PyMethodDef array_module_methods[] = {
     {"count_nonzero",
         (PyCFunction)array_count_nonzero,
         METH_VARARGS|METH_KEYWORDS, NULL},
+    {"count_reduce_items",
+        (PyCFunction)array_count_reduce_items,
+        METH_VARARGS|METH_KEYWORDS, NULL},
     {"empty",
         (PyCFunction)array_empty,
         METH_VARARGS|METH_KEYWORDS, NULL},
diff --git a/numpy/core/src/multiarray/reduction.c b/numpy/core/src/multiarray/reduction.c
index 30fb33bed..1866e3cba 100644
--- a/numpy/core/src/multiarray/reduction.c
+++ b/numpy/core/src/multiarray/reduction.c
@@ -17,6 +17,7 @@
 #include "npy_config.h"
 #include "numpy/npy_3kcompat.h"
 
+#include "lowlevel_strided_loops.h"
 #include "reduction.h"
 
 /*
@@ -418,6 +419,7 @@ PyArray_InitializeReduceResult(
  * inner_loop  : The inner loop which does the reduction.
  * masked_inner_loop: The inner loop which does the reduction with a mask.
  * data        : Data which is passed to assign_unit and the inner loop.
+ * buffersize  : Buffer size for the iterator. For the default, pass in 0.
  * funcname    : The name of the reduction function, for error messages.
  */
 NPY_NO_EXPORT PyArrayObject *
@@ -428,7 +430,7 @@ PyArray_ReduceWrapper(PyArrayObject *operand, PyArrayObject *out,
                         PyArray_AssignReduceUnitFunc *assign_unit,
                         PyArray_ReduceInnerLoopFunc *inner_loop,
                         PyArray_ReduceInnerLoopFunc *masked_inner_loop,
-                        void *data, const char *funcname)
+                        void *data, npy_intp buffersize, const char *funcname)
 {
     int use_maskna;
     PyArrayObject *result = NULL, *op_view = NULL;
@@ -549,10 +551,11 @@ PyArray_ReduceWrapper(PyArrayObject *operand, PyArrayObject *out,
         op_flags[1] |= NPY_ITER_IGNORE_MASKNA;
     }
 
-    iter = NpyIter_MultiNew(2, op, flags,
+    iter = NpyIter_AdvancedNew(2, op, flags,
                                NPY_KEEPORDER, NPY_SAME_KIND_CASTING,
                                op_flags,
-                               op_dtypes);
+                               op_dtypes,
+                               0, NULL, NULL, buffersize);
     if (iter == NULL) {
         Py_DECREF(result);
         Py_DECREF(op_dtypes[0]);
@@ -633,3 +636,138 @@ fail:
 
     return NULL;
 }
+
+/*
+ * This function counts the number of elements that a reduction
+ * will see along the reduction directions, given the provided options.
+ *
+ * If the reduction operand has no NA mask or 'skipna' is false, this
+ * is simply the prod`uct of all the reduction axis sizes. A NumPy
+ * scalar is returned in this case.
+ *
+ * If the reduction operand has an NA mask and 'skipna' is true, this
+ * counts the number of elements which are not NA along the reduction
+ * dimensions, and returns an array with the counts.
+ */
+NPY_NO_EXPORT PyObject *
+PyArray_CountReduceItems(PyArrayObject *operand,
+                            npy_bool *axis_flags, int skipna)
+{
+    int idim, ndim = PyArray_NDIM(operand);
+
+    /* The product of the reduction dimensions in this case */
+    if (!skipna || !PyArray_HASMASKNA(operand)) {
+        npy_intp count = 1, *shape = PyArray_SHAPE(operand);
+        PyArray_Descr *dtype;
+        PyObject *ret;
+
+        for (idim = 0; idim < ndim; ++idim) {
+            if (axis_flags[idim]) {
+                count *= shape[idim];
+            }
+        }
+
+        dtype = PyArray_DescrFromType(NPY_INTP);
+        if (dtype == NULL) {
+            return NULL;
+        }
+        ret = PyArray_Scalar(&count, dtype, NULL);
+        Py_DECREF(dtype);
+        return ret;
+    }
+    /* Otherwise we need to do a count based on the NA mask */
+    else {
+        npy_intp *strides;
+        PyArrayObject *result;
+        PyArray_Descr *result_dtype;
+
+        npy_intp i, coord[NPY_MAXDIMS];
+        npy_intp shape_it[NPY_MAXDIMS];
+        npy_intp operand_strides_it[NPY_MAXDIMS];
+        npy_intp result_strides_it[NPY_MAXDIMS];
+        char *operand_data = NULL, *result_data = NULL;
+
+        /*
+         * To support field-NA, we would create a result type
+         * with an INTP matching each field, then separately count
+         * the available elements per-field.
+         */
+        if (PyArray_HASFIELDS(operand)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                    "field-NA isn't implemented yet");
+            return NULL;
+        }
+
+        /*
+         * TODO: The loop below is specialized for NPY_BOOL masks,
+         *       will need another version for NPY_MASK masks.
+         */
+        if (PyArray_MASKNA_DTYPE(operand)->type_num != NPY_BOOL) {
+            PyErr_SetString(PyExc_RuntimeError,
+                    "multi-NA isn't implemented yet");
+            return NULL;
+        }
+
+        /* Allocate an array for the reduction counting */
+        result_dtype = PyArray_DescrFromType(NPY_INTP);
+        if (result_dtype == NULL) {
+            return NULL;
+        }
+        result = PyArray_CreateReduceResult(operand, NULL,
+                                result_dtype, axis_flags, 0,
+                                "count_reduce_items");
+        if (result == NULL) {
+            return NULL;
+        }
+
+        /* Initialize result to all zeros */
+        if (PyArray_AssignZero(result, NULL, 0, NULL) < 0) {
+            Py_DECREF(result);
+            return NULL;
+        }
+
+        /*
+         * Set all the reduction strides to 0 in result so
+         * we can use them for raw iteration
+         */
+        strides = PyArray_STRIDES(result);
+        for (idim = 0; idim < ndim; ++idim) {
+            if (axis_flags[idim]) {
+                strides[idim] = 0;
+            }
+        }
+
+        /*
+         * Sort axes based on 'operand', which has more non-zero strides,
+         * by making it the first operand here
+         */
+        if (PyArray_PrepareTwoRawArrayIter(ndim, PyArray_SHAPE(operand),
+                PyArray_MASKNA_DATA(operand), PyArray_MASKNA_STRIDES(operand),
+                            PyArray_DATA(result), PyArray_STRIDES(result),
+                            &ndim, shape_it,
+                            &operand_data, operand_strides_it,
+                            &result_data, result_strides_it) < 0) {
+            Py_DECREF(result);
+            return NULL;
+        }
+
+        /*
+         * NOTE: The following only works for NPY_BOOL masks.
+         */
+        NPY_RAW_ITER_START(idim, ndim, coord, shape_it) {
+            char *operand_d = operand_data, *result_d = result_data;
+            for (i = 0; i < shape_it[0]; ++i) {
+                *(npy_intp *)result_d += *operand_d;
+
+                operand_d += operand_strides_it[0];
+                result_d += result_strides_it[0];
+            }
+        } NPY_RAW_ITER_TWO_NEXT(idim, ndim, coord, shape_it,
+                                    operand_data, operand_strides_it,
+                                    result_data, result_strides_it);
+
+        /* Remove the reduction axes and return the result */
+        PyArray_RemoveAxesInPlace(result, axis_flags);
+        return PyArray_Return(result);
+    }
+}
diff --git a/numpy/core/src/multiarray/reduction.h b/numpy/core/src/multiarray/reduction.h
index b2334f896..849db256b 100644
--- a/numpy/core/src/multiarray/reduction.h
+++ b/numpy/core/src/multiarray/reduction.h
@@ -100,8 +100,7 @@ typedef void (PyArray_ReduceInnerLoopFunc)(NpyIter *iter,
  * inner_loop  : The inner loop which does the reduction.
  * masked_inner_loop: The inner loop which does the reduction with a mask.
  * data        : Data which is passed to assign_unit and the inner loop.
- * needs_api   : Should be 1 if the inner loop might call a Python API
- *               function like PyErr_SetString.
+ * buffersize  : Buffer size for the iterator. For the default, pass in 0.
  * funcname    : The name of the reduction function, for error messages.
  */
 NPY_NO_EXPORT PyArrayObject *
@@ -112,6 +111,23 @@ PyArray_ReduceWrapper(PyArrayObject *operand, PyArrayObject *out,
                         PyArray_AssignReduceUnitFunc *assign_unit,
                         PyArray_ReduceInnerLoopFunc *inner_loop,
                         PyArray_ReduceInnerLoopFunc *masked_inner_loop,
-                        void *data, const char *funcname);
+                        void *data, npy_intp buffersize, const char *funcname);
+
+/*
+ * This function counts the number of elements that a reduction
+ * will see along the reduction directions, given the provided options.
+ *
+ * If the reduction operand has no NA mask or 'skipna' is false, this
+ * is simply the prod`uct of all the reduction axis sizes. A NumPy
+ * scalar is returned in this case.
+ *
+ * If the reduction operand has an NA mask and 'skipna' is true, this
+ * counts the number of elements which are not NA along the reduction
+ * dimensions, and returns an array with the counts.
+ */
+NPY_NO_EXPORT PyObject *
+PyArray_CountReduceItems(PyArrayObject *operand,
+                            npy_bool *axis_flags, int skipna);
+
 
 #endif
diff --git a/numpy/core/tests/test_maskna.py b/numpy/core/tests/test_maskna.py
index bee959a6d..f946e8073 100644
--- a/numpy/core/tests/test_maskna.py
+++ b/numpy/core/tests/test_maskna.py
@@ -792,6 +792,80 @@ def check_ufunc_max_1D(max_func):
     a[...] = np.NA
     assert_raises(ValueError, max_func, a, skipna=True)
 
+def test_count_reduce_items():
+    # np.count_reduce_items
+
+    # When skipna is False, it should always return the
+    # product of the reduction axes as a NumPy intp scalar
+    a = np.zeros((2,3,4))
+
+    res = np.count_reduce_items(a)
+    assert_equal(res, 24)
+    assert_equal(type(res), np.intp)
+
+    res = np.count_reduce_items(a, axis=0)
+    assert_equal(res, 2)
+    assert_equal(type(res), np.intp)
+
+    res = np.count_reduce_items(a, axis=(1,2))
+    assert_equal(res, 12)
+    assert_equal(type(res), np.intp)
+
+    # This still holds if 'a' has an NA mask and some NA values
+    a = np.zeros((2,3,4), maskna=True)
+    a[1,2,2] = np.NA
+    a[0,1,2] = np.NA
+    a[1,0,3] = np.NA
+
+    res = np.count_reduce_items(a)
+    assert_equal(res, 24)
+    assert_equal(type(res), np.intp)
+
+    res = np.count_reduce_items(a, axis=0)
+    assert_equal(res, 2)
+    assert_equal(type(res), np.intp)
+
+    res = np.count_reduce_items(a, axis=(1,2))
+    assert_equal(res, 12)
+    assert_equal(type(res), np.intp)
+
+    # If skipna is True, but the array has no NA mask, the result
+    # should still be the product of the reduction axes
+    a = np.zeros((2,3,4))
+
+    res = np.count_reduce_items(a, skipna=True)
+    assert_equal(res, 24)
+    assert_equal(type(res), np.intp)
+
+    res = np.count_reduce_items(a, axis=0, skipna=True)
+    assert_equal(res, 2)
+    assert_equal(type(res), np.intp)
+
+    res = np.count_reduce_items(a, axis=(1,2), skipna=True)
+    assert_equal(res, 12)
+    assert_equal(type(res), np.intp)
+
+    # Finally, when skipna is True AND the array has an NA mask,
+    # we get an array of counts
+    a = np.zeros((2,3,4), maskna=True)
+    a[1,2,2] = np.NA
+    a[0,1,2] = np.NA
+    a[1,0,3] = np.NA
+
+    # When doing a full reduction, should still get the scalar
+    res = np.count_reduce_items(a, skipna=True)
+    assert_equal(res, 21)
+    assert_equal(res.dtype, np.dtype(np.intp))
+
+    res = np.count_reduce_items(a, axis=0, skipna=True)
+    assert_equal(res, [[2,2,2,1], [2,2,1,2], [2,2,1,2]])
+    assert_equal(res.dtype, np.dtype(np.intp))
+
+    res = np.count_reduce_items(a, axis=(1,2), skipna=True)
+    assert_equal(res, [11,10])
+    assert_equal(res.dtype, np.dtype(np.intp))
+
+
 def test_array_maskna_clip_method():
     # ndarray.clip
     a = np.array([2, np.NA, 10, 4, np.NA, 7], maskna=True)
author	Mark Wiebe <mwwiebe@gmail.com>	2011-08-17 19:18:56 -0700
committer	Charles Harris <charlesr.harris@gmail.com>	2011-08-27 07:26:56 -0600
commit	bfda229ec93d37b1ee2cdd8b9443ec4e34536bbf (patch)
tree	1315cce82b32c73eb9390e3fa46d8a8e0ddecebe
parent	d9b3f90de3213ece9a78b77088fdec17910e81d9 (diff)
download	numpy-bfda229ec93d37b1ee2cdd8b9443ec4e34536bbf.tar.gz