ENH: missingdata: Finish count_nonzero as a full-fledged reduction operation

author: Mark Wiebe <mwwiebe@gmail.com> 2011-08-17 15:45:52 -0700
committer: Charles Harris <charlesr.harris@gmail.com> 2011-08-27 07:26:55 -0600
commit: 67ece6bdd2b35d011893e78154dbff6ab51c7d35 (patch)
tree: fa009f26a3c49ebaa51504e0997f0128c3257810 /numpy
parent: 6bfd819a0897caf6e6db244930c40ed0d17b9e62 (diff)
download: numpy-67ece6bdd2b35d011893e78154dbff6ab51c7d35.tar.gz
8 files changed, 332 insertions, 46 deletions
diff --git a/numpy/core/src/multiarray/conversion_utils.c b/numpy/core/src/multiarray/conversion_utils.c
index 5e59bf306..254b5389d 100644
--- a/numpy/core/src/multiarray/conversion_utils.c
+++ b/numpy/core/src/multiarray/conversion_utils.c
@@ -199,7 +199,7 @@ PyArray_AxisConverter(PyObject *obj, int *axis)
  * Converts an axis parameter into an ndim-length C-array of
  * boolean flags, True for each axis specified.
  *
- * If obj is None, everything is set to True. If obj is a tuple,
+ * If obj is None or NULL, everything is set to True. If obj is a tuple,
  * each axis within the tuple is set to True. If obj is an integer,
  * just that axis is set to True.
  */
diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c
index 28f243600..28fc7e7fa 100644
--- a/numpy/core/src/multiarray/item_selection.c
+++ b/numpy/core/src/multiarray/item_selection.c
@@ -1902,10 +1902,16 @@ PyArray_ReduceCountNonzero(PyArrayObject *arr, PyArrayObject *out,
                         npy_bool *axis_flags, int skipna)
 {
     PyArray_NonzeroFunc *nonzero;
-    int ndim, use_maskna;
+    int use_maskna;
     PyArray_Descr *dtype;
     PyArrayObject *result = NULL;
 
+    /* Iterator parameters */
+    NpyIter *iter = NULL;
+    PyArrayObject *op[2];
+    PyArray_Descr *op_dtypes[2];
+    npy_uint32 flags, op_flags[2];
+
     nonzero = PyArray_DESCR(arr)->f->nonzero;
     if (nonzero == NULL) {
         PyErr_SetString(PyExc_TypeError,
@@ -1914,7 +1920,6 @@ PyArray_ReduceCountNonzero(PyArrayObject *arr, PyArrayObject *out,
         return NULL;
     }
 
-    ndim = PyArray_NDIM(arr);
     use_maskna = PyArray_HASMASKNA(arr);
 
     /*
@@ -1948,15 +1953,184 @@ PyArray_ReduceCountNonzero(PyArrayObject *arr, PyArrayObject *out,
         return NULL;
     }
 
+    /*
+     * Do the reduction on the NA mask before the data. This way
+     * we can avoid modifying the outputs which end up masked, obeying
+     * the required NA masking semantics.
+     */
+    if (use_maskna && !skipna) {
+        if (PyArray_ReduceMaskNAArray(result, arr) < 0) {
+            Py_DECREF(result);
+            return NULL;
+        }
+
+        /* Short circuit any calculation if the result is 0-dim NA */
+        if (PyArray_SIZE(result) == 1 &&
+                !NpyMaskValue_IsExposed(
+                            (npy_mask)*PyArray_MASKNA_DATA(result))) {
+            goto finish;
+        }
+    }
+
+    /*
+     * Initialize the result to all zeros, except for the NAs
+     * when skipna is False.
+     */
+    if (PyArray_AssignZero(result, NULL, !skipna, NULL) < 0) {
+        Py_DECREF(result);
+        return NULL;
+    }
+
+    /* Set up the iterator */
+    op[0] = result;
+    op[1] = arr;
+    op_dtypes[0] = PyArray_DescrFromType(NPY_INTP);
+    if (op_dtypes[0] == NULL) {
+        Py_DECREF(result);
+        return NULL;
+    }
+    op_dtypes[1] = NULL;
+
+    flags = NPY_ITER_BUFFERED |
+            NPY_ITER_EXTERNAL_LOOP |
+            NPY_ITER_DONT_NEGATE_STRIDES |
+            NPY_ITER_ZEROSIZE_OK |
+            NPY_ITER_REDUCE_OK |
+            NPY_ITER_REFS_OK;
+    op_flags[0] = NPY_ITER_READWRITE |
+                  NPY_ITER_ALIGNED |
+                  NPY_ITER_NO_SUBTYPE;
+    op_flags[1] = NPY_ITER_READONLY |
+                  NPY_ITER_ALIGNED;
+
+    /* Add mask-related flags */
     if (use_maskna) {
+        if (skipna) {
+            /* The output's mask has been set to all exposed already */
+            op_flags[0] |= NPY_ITER_IGNORE_MASKNA;
+            /* Need the input's mask to determine what to skip */
+            op_flags[1] |= NPY_ITER_USE_MASKNA;
+        }
+        else {
+            /* Iterate over the output's mask */
+            op_flags[0] |= NPY_ITER_USE_MASKNA;
+            /* The input's mask is already incorporated in the output's mask */
+            op_flags[1] |= NPY_ITER_IGNORE_MASKNA;
+        }
+    }
+    else {
         /*
-         * Do the reduction on the NA mask before the data. This way
-         * we can avoid modifying the outputs which end up masked, obeying
-         * the required NA masking semantics.
+         * If 'out' had no mask, and 'arr' did, we checked that 'arr'
+         * contains no NA values and can ignore the masks.
          */
-        if (!skipna) {
+        op_flags[0] |= NPY_ITER_IGNORE_MASKNA;
+        op_flags[1] |= NPY_ITER_IGNORE_MASKNA;
+    }
+
+    iter = NpyIter_MultiNew(2, op, flags,
+                               NPY_KEEPORDER, NPY_UNSAFE_CASTING,
+                               op_flags,
+                               op_dtypes);
+    if (iter == NULL) {
+        Py_DECREF(result);
+        Py_DECREF(op_dtypes[0]);
+        return NULL;
+    }
+
+    if (NpyIter_GetIterSize(iter) != 0) {
+        int needs_api;
+        NpyIter_IterNextFunc *iternext;
+        char **dataptr;
+        npy_intp *strideptr;
+        npy_intp *countptr;
+        NPY_BEGIN_THREADS_DEF;
+
+        iternext = NpyIter_GetIterNext(iter, NULL);
+        if (iternext == NULL) {
+            Py_DECREF(result);
+            Py_DECREF(op_dtypes[0]);
+            NpyIter_Deallocate(iter);
+            return NULL;
+        }
+        dataptr = NpyIter_GetDataPtrArray(iter);
+        strideptr = NpyIter_GetInnerStrideArray(iter);
+        countptr = NpyIter_GetInnerLoopSizePtr(iter);
+
+        needs_api = NpyIter_IterationNeedsAPI(iter) ||
+                    PyDataType_REFCHK(PyArray_DESCR(arr));
+
+        if (!needs_api) {
+            NPY_BEGIN_THREADS;
+        }
+
+        /* Straightforward reduction */
+        if (!use_maskna) {
+            do {
+                char *data0 = dataptr[0];
+                char *data1 = dataptr[1];
+                npy_intp stride0 = strideptr[0];
+                npy_intp stride1 = strideptr[1];
+                npy_intp count = *countptr;
+
+                while (count--) {
+                    if (nonzero(data1, arr)) {
+                        ++(*(npy_intp *)data0);
+                    }
+                    data0 += stride0;
+                    data1 += stride1;
+                }
+            } while (iternext(iter));
+        }
+        /* Masked reduction */
+        else {
+            do {
+                char *data0 = dataptr[0];
+                char *data1 = dataptr[1];
+                char *data2 = dataptr[2];
+                npy_intp stride0 = strideptr[0];
+                npy_intp stride1 = strideptr[1];
+                npy_intp stride2 = strideptr[2];
+                npy_intp count = *countptr;
+
+                while (count--) {
+                    if (NpyMaskValue_IsExposed((npy_mask)*data2) &&
+                                                nonzero(data1, arr)) {
+                        ++(*(npy_intp *)data0);
+                    }
+                    data0 += stride0;
+                    data1 += stride1;
+                    data2 += stride2;
+                }
+            } while (iternext(iter));
+        }
+
+        if (!needs_api) {
+            NPY_END_THREADS;
+        }
+
+        if (needs_api && PyErr_Occurred()) {
+            Py_DECREF(result);
+            Py_DECREF(op_dtypes[0]);
+            NpyIter_Deallocate(iter);
+            return NULL;
         }
     }
+
+    Py_DECREF(op_dtypes[0]);
+    NpyIter_Deallocate(iter);
+
+finish:
+    /* Strip out the extra 'one' dimensions in the result */
+    if (out == NULL) {
+        PyArray_RemoveAxesInPlace(result, axis_flags);
+    }
+    else {
+        Py_DECREF(result);
+        result = out;
+        Py_INCREF(result);
+    }
+
+    return PyArray_Return(result);
 }
 
 /*NUMPY_API
diff --git a/numpy/core/src/multiarray/item_selection.h b/numpy/core/src/multiarray/item_selection.h
index fdb6fc39f..722aaa5d1 100644
--- a/numpy/core/src/multiarray/item_selection.h
+++ b/numpy/core/src/multiarray/item_selection.h
@@ -27,5 +27,16 @@ NPY_NO_EXPORT int
 PyArray_MultiIndexSetItem(PyArrayObject *self, npy_intp *multi_index,
                                                 PyObject *obj);
 
+/*
+ * A full reduction version of PyArray_CountNonzero, supporting
+ * an 'out' parameter and doing the count as a reduction along
+ * selected axes. It also supports a 'skipna' parameter, which
+ * skips over any NA masked values in arr.
+ */
+NPY_NO_EXPORT PyObject *
+PyArray_ReduceCountNonzero(PyArrayObject *arr, PyArrayObject *out,
+                        npy_bool *axis_flags, int skipna);
+
+
 
 #endif
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index 003be5e39..e3cb3709e 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -42,12 +42,14 @@ NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0;
 #include "scalartypes.h"
 #include "numpymemoryview.h"
 #include "convert_datatype.h"
+#include "conversion_utils.h"
 #include "nditer_pywrap.h"
 #include "methods.h"
 #include "_datetime.h"
 #include "datetime_strings.h"
 #include "datetime_busday.h"
 #include "datetime_busdaycal.h"
+#include "item_selection.h"
 #include "shape.h"
 #include "ctors.h"
 #include "na_singleton.h"
@@ -2161,13 +2163,21 @@ array_zeros(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
 }
 
 static PyObject *
-array_count_nonzero(PyObject *NPY_UNUSED(self), PyObject *args)
+array_count_nonzero(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds)
 {
-    PyObject *array_in;
-    PyArrayObject *array;
-    npy_intp count;
+    static char *kwlist[] = {"arr", "axis", "out", "skipna", NULL};
 
-    if (!PyArg_ParseTuple(args, "O", &array_in)) {
+    PyObject *array_in, *axis_in = NULL, *out_in = NULL;
+    PyObject *ret = NULL;
+    PyArrayObject *array, *out = NULL;
+    npy_bool axis_flags[NPY_MAXDIMS];
+    int skipna = 0;
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|OOi:count_nonzero", kwlist,
+                                &array_in,
+                                &axis_in,
+                                &out_in,
+                                &skipna)) {
         return NULL;
     }
 
@@ -2177,22 +2187,27 @@ array_count_nonzero(PyObject *NPY_UNUSED(self), PyObject *args)
         return NULL;
     }
 
-    count =  PyArray_CountNonzero(array);
+    if (PyArray_ConvertMultiAxis(axis_in, PyArray_NDIM(array),
+                                        axis_flags) != NPY_SUCCEED) {
+        Py_DECREF(array);
+        return NULL;
+    }
+
+    if (out_in != NULL) {
+        if (PyArray_Check(out_in)) {
+            out = (PyArrayObject *)out_in;
+        }
+        else {
+            PyErr_SetString(PyExc_TypeError, "'out' must be an array");
+            return NULL;
+        }
+    }
+
+    ret = PyArray_ReduceCountNonzero(array, out, axis_flags, skipna);
 
     Py_DECREF(array);
 
-#if defined(NPY_PY3K)
-    return (count == -1) ? NULL : PyLong_FromSsize_t(count);
-#elif PY_VERSION_HEX >= 0x02050000
-    return (count == -1) ? NULL : PyInt_FromSsize_t(count);
-#else
-    if ((npy_intp)((long)count) == count) {
-        return (count == -1) ? NULL : PyInt_FromLong(count);
-    }
-    else {
-        return (count == -1) ? NULL : PyLong_FromVoidPtr((void*)count);
-    }
-#endif
+    return ret;
 }
 
 static PyObject *
@@ -3740,7 +3755,7 @@ static struct PyMethodDef array_module_methods[] = {
         METH_VARARGS|METH_KEYWORDS, NULL},
     {"count_nonzero",
         (PyCFunction)array_count_nonzero,
-        METH_VARARGS, NULL},
+        METH_VARARGS|METH_KEYWORDS, NULL},
     {"empty",
         (PyCFunction)array_empty,
         METH_VARARGS|METH_KEYWORDS, NULL},
diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c
index c64563950..d845140ad 100644
--- a/numpy/core/src/umath/ufunc_object.c
+++ b/numpy/core/src/umath/ufunc_object.c
@@ -2740,7 +2740,7 @@ PyUFunc_Reduce(PyUFuncObject *self, PyArrayObject *arr, PyArrayObject *out,
                 goto fail;
             }
 
-            /* Short circuit any calculation if the result 0-dim NA */
+            /* Short circuit any calculation if the result is 0-dim NA */
             if (PyArray_SIZE(result) == 1 &&
                     !NpyMaskValue_IsExposed(
                                 (npy_mask)*PyArray_MASKNA_DATA(result))) {
@@ -2815,8 +2815,6 @@ PyUFunc_Reduce(PyUFuncObject *self, PyArrayObject *arr, PyArrayObject *out,
             op_flags[0] |= NPY_ITER_IGNORE_MASKNA;
             /* Need the input's mask to determine what to skip */
             op_flags[1] |= NPY_ITER_USE_MASKNA;
-
-            /* TODO: allocate a temporary buffer for inverting the mask */
         }
         else {
             /* Iterate over the output's mask */
diff --git a/numpy/core/tests/test_maskna.py b/numpy/core/tests/test_maskna.py
index b839b661e..bee959a6d 100644
--- a/numpy/core/tests/test_maskna.py
+++ b/numpy/core/tests/test_maskna.py
@@ -600,7 +600,7 @@ def test_maskna_nonzero_1D():
 
     # The nonzeros with an NA
     a[2] = np.NA
-    assert_raises(ValueError, np.count_nonzero, a)
+    assert_(np.isna(np.count_nonzero(a)))
     assert_raises(ValueError, np.nonzero, a)
 
 def test_maskna_take_1D():
diff --git a/numpy/core/tests/test_numeric.py b/numpy/core/tests/test_numeric.py
index db3a879ca..2f8c2fcc9 100644
--- a/numpy/core/tests/test_numeric.py
+++ b/numpy/core/tests/test_numeric.py
@@ -594,6 +594,50 @@ class TestNonzero(TestCase):
         assert_equal(np.nonzero(x['a'].T), ([0,1,1,2],[1,1,2,0]))
         assert_equal(np.nonzero(x['b'].T), ([0,0,1,2,2],[0,1,2,0,2]))
 
+    def test_count_nonzero_axis(self):
+        a = array([[0,1,0],[2,3,0]])
+        assert_equal(np.count_nonzero(a, axis=()), [[0,1,0],[1,1,0]])
+        assert_equal(np.count_nonzero(a, axis=0), [1,2,0])
+        assert_equal(np.count_nonzero(a, axis=1), [1,2])
+        assert_equal(np.count_nonzero(a, axis=(0,1)), 3)
+
+        res = array([-1,-1,-1], dtype='i2')
+        np.count_nonzero(a, axis=0, out=res)
+        assert_equal(res, [1,2,0])
+
+        # A 3-dimensional array with an NA
+        a = array([[[0,1,0],[2,np.NA,0]], [[0,1,0],[2,3,0]]], maskna=True)
+
+        # Test that the NA reduces correctly
+        assert_array_equal(np.count_nonzero(a, axis=()),
+                            [[[0,1,0],[1,np.NA,0]], [[0,1,0],[1,1,0]]])
+        assert_array_equal(np.count_nonzero(a, axis=0), [[0,2,0], [2,np.NA,0]])
+        assert_array_equal(np.count_nonzero(a, axis=1), [[1,np.NA,0], [1,2,0]])
+        assert_array_equal(np.count_nonzero(a, axis=2), [[1,np.NA], [1,2]])
+        assert_array_equal(np.count_nonzero(a, axis=(0,1)), [2,np.NA,0])
+        assert_array_equal(np.count_nonzero(a, axis=(0,2)), [2,np.NA])
+        assert_array_equal(np.count_nonzero(a, axis=(1,2)), [np.NA,3])
+        assert_array_equal(np.count_nonzero(a, axis=(0,1,2)),
+                                                np.NA(dtype=np.intp))
+        assert_array_equal(np.count_nonzero(a, axis=None),
+                                                np.NA(dtype=np.intp))
+
+        # Test that the NA gets skipped correctly
+        assert_array_equal(np.count_nonzero(a, axis=(), skipna=True),
+                            [[[0,1,0],[1,0,0]], [[0,1,0],[1,1,0]]])
+        assert_array_equal(np.count_nonzero(a, axis=0, skipna=True),
+                            [[0,2,0], [2,1,0]])
+        assert_array_equal(np.count_nonzero(a, axis=1, skipna=True),
+                            [[1,1,0], [1,2,0]])
+        assert_array_equal(np.count_nonzero(a, axis=2, skipna=True),
+                            [[1,1], [1,2]])
+        assert_array_equal(np.count_nonzero(a, axis=(0,1), skipna=True),
+                            [2,3,0])
+        assert_array_equal(np.count_nonzero(a, axis=(0,2), skipna=True), [2,3])
+        assert_array_equal(np.count_nonzero(a, axis=(1,2), skipna=True), [2,3])
+        assert_array_equal(np.count_nonzero(a, axis=(0,1,2), skipna=True), 5)
+        assert_array_equal(np.count_nonzero(a, axis=None, skipna=True), 5)
+
 class TestIndex(TestCase):
     def test_boolean(self):
         a = rand(3,5,8)
diff --git a/numpy/testing/utils.py b/numpy/testing/utils.py
index 7753427c0..a0e395c45 100644
--- a/numpy/testing/utils.py
+++ b/numpy/testing/utils.py
@@ -567,7 +567,7 @@ def assert_approx_equal(actual,desired,significant=7,err_msg='',verbose=True):
 
 def assert_array_compare(comparison, x, y, err_msg='', verbose=True,
                          header=''):
-    from numpy.core import array, isnan, isinf, any
+    from numpy.core import array, isnan, isinf, isna, any, all, inf
     x = array(x, copy=False, subok=True)
     y = array(y, copy=False, subok=True)
 
@@ -598,22 +598,64 @@ def assert_array_compare(comparison, x, y, err_msg='', verbose=True,
             if not cond :
                 raise AssertionError(msg)
 
-        if (isnumber(x) and isnumber(y)) and (any(isnan(x)) or any(isnan(y))):
-            x_id = isnan(x)
-            y_id = isnan(y)
-            chk_same_position(x_id, y_id, hasval='nan')
-            # If only one item, it was a nan, so just return
-            if x.size == y.size == 1:
+        if isnumber(x) and isnumber(y):
+            x_isna, y_isna = isna(x), isna(y)
+            x_isnan, y_isnan = isnan(x), isnan(y)
+            x_isinf, y_isinf = isinf(x), isinf(y)
+
+            # Remove any NAs from the isnan and isinf arrays
+            if x.ndim == 0:
+                if x_isna:
+                    x_isnan = False
+                    x_isinf = False
+            else:
+                x_isnan[x_isna] = False
+                x_isinf[x_isna] = False
+            if y.ndim == 0:
+                if y_isna:
+                    y_isnan = False
+                    y_isinf = False
+            else:
+                y_isnan[y_isna] = False
+                y_isinf[y_isna] = False
+
+
+            # Validate that the special values are in the same place
+            if any(x_isnan) or any(y_isnan):
+                chk_same_position(x_isnan, y_isnan, hasval='nan')
+            if any(x_isinf) or any(y_isinf):
+                # Check +inf and -inf separately, since they are different
+                chk_same_position(x == +inf, y == +inf, hasval='+inf')
+                chk_same_position(x == -inf, y == -inf, hasval='-inf')
+            if any(x_isna) or any(y_isna):
+                chk_same_position(x_isna, y_isna, hasval='NA')
+
+            # Combine all the special values
+            x_id, y_id = x_isnan, y_isnan
+            x_id |= x_isinf
+            y_id |= y_isinf
+            x_id |= x_isna
+            y_id |= y_isna
+
+            # Only do the comparison if actual values are left
+            if all(x_id):
                 return
-            val = comparison(x[~x_id], y[~y_id])
-        elif (isnumber(x) and isnumber(y)) and (any(isinf(x)) or any(isinf(y))):
-            x_id = isinf(x)
-            y_id = isinf(y)
-            chk_same_position(x_id, y_id, hasval='inf')
-            # If only one item, it was a inf, so just return
-            if x.size == y.size == 1:
+
+            if any(x_id):
+                val = comparison(x[~x_id], y[~y_id])
+            else:
+                val = comparison(x, y)
+        # field-NA isn't supported yet, so skip struct dtypes for this
+        elif (not x.dtype.names and not y.dtype.names) and \
+                    (any(isna(x)) or any(isna(y))):
+            x_isna, y_isna = isna(x), isna(y)
+
+            if any(x_isna) or any(y_isna):
+                chk_same_position(x_isna, y_isna, hasval='NA')
+
+            if all(x_isna):
                 return
-            val = comparison(x[~x_id], y[~y_id])
+            val = comparison(x[~x_isna], y[~y_isna])
         else:
             val = comparison(x,y)
 
@@ -650,7 +692,9 @@ def assert_array_equal(x, y, err_msg='', verbose=True):
     elements of these objects are equal. An exception is raised at
     shape mismatch or conflicting values. In contrast to the standard usage
     in numpy, NaNs are compared like numbers, no assertion is raised if
-    both objects have NaNs in the same positions.
+    both objects have NaNs in the same positions. Similarly, NAs are compared
+    like numbers, no assertion is raised if both objects have NAs in the
+    same positions.
 
     The usual caution for verifying equality with floating point numbers is
     advised.
author	Mark Wiebe <mwwiebe@gmail.com>	2011-08-17 15:45:52 -0700
committer	Charles Harris <charlesr.harris@gmail.com>	2011-08-27 07:26:55 -0600
commit	67ece6bdd2b35d011893e78154dbff6ab51c7d35 (patch)
tree	fa009f26a3c49ebaa51504e0997f0128c3257810 /numpy
parent	6bfd819a0897caf6e6db244930c40ed0d17b9e62 (diff)
download	numpy-67ece6bdd2b35d011893e78154dbff6ab51c7d35.tar.gz