Add histogramnd and fix histogram2d

author: Travis Oliphant <oliphant@enthought.com> 2006-09-14 01:49:10 +0000
committer: Travis Oliphant <oliphant@enthought.com> 2006-09-14 01:49:10 +0000
commit: 4e76e00cc5afceaf70fe8d655cf59d4a9fb85a0a (patch)
tree: 39a078cf6b53847505be2cb55dad65fcfc1056e4 /numpy
parent: 6359dccac9d940dc3291de360b4cb377183e1b9d (diff)
download: numpy-4e76e00cc5afceaf70fe8d655cf59d4a9fb85a0a.tar.gz
5 files changed, 238 insertions, 45 deletions
diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py
index 29308d443..4701cb03e 100644
--- a/numpy/lib/function_base.py
+++ b/numpy/lib/function_base.py
@@ -4,23 +4,23 @@ __all__ = ['logspace', 'linspace',
            'diff', 'gradient', 'angle', 'unwrap', 'sort_complex', 'disp',
            'unique', 'extract', 'place', 'nansum', 'nanmax', 'nanargmax',
            'nanargmin', 'nanmin', 'vectorize', 'asarray_chkfinite', 'average',
-           'histogram', 'bincount', 'digitize', 'cov', 'corrcoef', 'msort',
-           'median', 'sinc', 'hamming', 'hanning', 'bartlett', 'blackman',
-           'kaiser', 'trapz', 'i0', 'add_newdoc', 'add_docstring', 'meshgrid',
-           'delete', 'insert', 'append'
+           'histogram', 'histogramnd', 'bincount', 'digitize', 'cov',
+           'corrcoef', 'msort', 'median', 'sinc', 'hamming', 'hanning',
+           'bartlett', 'blackman', 'kaiser', 'trapz', 'i0', 'add_newdoc',
+           'add_docstring', 'meshgrid', 'delete', 'insert', 'append'
            ]
 
 import types
 import numpy.core.numeric as _nx
 from numpy.core.numeric import ones, zeros, arange, concatenate, array, \
-     asarray, asanyarray, empty, empty_like, asanyarray, ndarray
+     asarray, asanyarray, empty, empty_like, asanyarray, ndarray, around
 from numpy.core.numeric import ScalarType, dot, where, newaxis, intp, \
-     integer
+     integer, isscalar
 from numpy.core.umath import pi, multiply, add, arctan2,  \
-     frompyfunc, isnan, cos, less_equal, sqrt, sin, mod, exp
+     frompyfunc, isnan, cos, less_equal, sqrt, sin, mod, exp, log10
 from numpy.core.fromnumeric import ravel, nonzero, choose, sort
 from numpy.core.numerictypes import typecodes
-from numpy.lib.shape_base import atleast_1d
+from numpy.lib.shape_base import atleast_1d, atleast_2d
 from numpy.lib.twodim_base import diag
 from _compiled_base import _insert, add_docstring
 from _compiled_base import digitize, bincount
@@ -75,8 +75,7 @@ def histogram(a, bins=10, range=None, normed=False):
     ----------
     bins: Number of bins
     range: Lower and upper bin edges (default: [sample.min(), sample.max()]).
-        Does not really work, all values greater than range are stored in
-        the last bin.
+        All values greater than range are stored in the last bin.
     normed: If False (default), return the number of samples in each bin.
         If True, return a frequency distribution.
 
@@ -104,6 +103,130 @@ def histogram(a, bins=10, range=None, normed=False):
     else:
         return n, bins
 
+def  histogramnd(sample, bins=10, range=None, normed=False):
+    """histogramnd(sample, bins = 10, range = None, normed = False) -> H, edges
+    
+    Return the N-dimensional histogram computed from sample.
+    
+    Parameters
+    ----------
+    sample: A sequence of N arrays, or an KxN array. 
+    bins:   A sequence of edge arrays, or a sequence of the number of bins. 
+            If a scalar is given, it is assumed to be the number of bins
+            for all dimensions. 
+    range:  A sequence of lower and upper bin edges (default: [min, max]).
+    normed: If False, returns the number of samples in each bin. 
+            If True, returns the frequency distribution.
+    
+    
+    Output
+    ------
+    H:      Histogram array.
+    edges:  List of arrays defining the bin edges. 
+    
+    Example:
+    x = random.randn(100,3)
+    H, edges = histogramnd(x, bins = (5, 6, 7))
+    
+    See also: histogram
+    """
+    
+    try:
+        N, D = sample.shape
+    except (AttributeError, ValueError):
+        ss = atleast_2d(sample)
+        sample = ss.transpose()
+        N, D = sample.shape
+    
+    nbin = empty(D, int)
+    edges = D*[None]
+    dedges = D*[None]
+
+    try:
+        M = len(bins)
+        if M != D:
+            raise AttributeError, 'The dimension of bins must be a equal to the dimension of the sample x.'
+    except TypeError:
+        bins = D*[bins]
+        
+    if range is None:
+        smin = atleast_1d(sample.min(0))
+        smax = atleast_1d(sample.max(0))
+    else:
+        smin = zeros(D)
+        smax = zeros(D)
+        for i in arange(D):
+            smin[i], smax[i] = range[i]
+    
+    for i in arange(D):
+        if isscalar(bins[i]):
+            nbin[i] = bins[i]
+            edges[i] = linspace(smin[i], smax[i], nbin[i]+1)
+        else:
+            edges[i] = asarray(bins[i], float)
+            nbin[i] = len(edges[i])-1
+            
+
+
+    Ncount = {}
+    nbin =  asarray(nbin)
+
+    for i in arange(D):
+        Ncount[i] = digitize(sample[:,i], edges[i]) 
+        dedges[i] = diff(edges[i])
+    # Remove values falling outside of bins
+    # Values that fall on an edge are put in the right bin.
+    # For the rightmost bin, we want values equal to the right 
+    # edge to be counted in the last bin, and not as an outlier.
+    outliers = zeros(N, int)
+    for i in arange(D):
+        decimal = int(-log10(dedges[i].min())) +6
+        on_edge = where(around(sample[:,i], decimal) == around(edges[i][-1], decimal))[0]
+        Ncount[i][on_edge] -= 1
+        outliers += (Ncount[i] == 0) | (Ncount[i] == nbin[i]+1)
+    indices = where(outliers == 0)[0]
+    for i in arange(D):
+        Ncount[i] = Ncount[i][indices] - 1
+    N = len(indices)
+    
+    # Flattened histogram matrix (1D)
+    hist = zeros(nbin.prod(), int)
+    
+    # Compute the sample indices in the flattened histogram matrix.
+    ni = nbin.argsort()
+    shape = []
+    xy = zeros(N, int)
+    for i in arange(0, D-1):
+        xy += Ncount[ni[i]] * nbin[ni[i+1:]].prod()
+        
+    xy += Ncount[ni[-1]]
+
+    # Compute the number of repetitions in xy and assign it to the flattened histmat. 
+    if len(xy) == 0:
+        return zeros(nbin, int)
+        
+    flatcount = bincount(xy)
+    a = arange(len(flatcount))
+    hist[a] = flatcount
+
+    # Shape into a proper matrix
+    hist = hist.reshape(sort(nbin))
+    for i,j in enumerate(ni):
+        hist = hist.swapaxes(i,j)
+        if (hist.shape == nbin).all():
+            break
+    
+    if normed:
+        s = hist.sum()
+        for i in arange(D):
+            shape = ones(D, int)
+            shape[i] = nbin[i]
+            hist = hist / dedges[i].reshape(shape)
+        hist /= s
+        
+    return hist, edges 
+    
+
 def average(a, axis=None, weights=None, returned=False):
     """average(a, axis=None weights=None, returned=False)
 
diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py
index b548ce386..a8ccf5eaa 100644
--- a/numpy/lib/tests/test_function_base.py
+++ b/numpy/lib/tests/test_function_base.py
@@ -353,6 +353,31 @@ class test_histogram(NumpyTestCase):
         (a,b)=histogram(linspace(0,10,100))
         assert(all(a==10))
 
+class test_histogramnd(NumpyTestCase):
+    def check_simple(self):
+        x = array([[-.5, .5, 1.5], [-.5, 1.5, 2.5], [-.5, 2.5, .5], \
+        [.5, .5, 1.5], [.5, 1.5, 2.5], [.5, 2.5, 2.5]])
+        H, edges = histogramnd(x, (2,3,3), range = [[-1,1], [0,3], [0,3]])
+        answer = asarray([[[0,1,0], [0,0,1], [1,0,0]], [[0,1,0], [0,0,1], [0,0,1]]])
+        assert(all(H == answer))
+        # Check normalization
+        ed = [[-2,0,2], [0,1,2,3], [0,1,2,3]]
+        H, edges = histogramnd(x, bins = ed, normed = True)
+        assert(all(H == answer/12.))
+        # Check that H has the correct shape.
+        H, edges = histogramnd(x, (2,3,4), range = [[-1,1], [0,3], [0,4]], normed=True)
+        answer = asarray([[[0,1,0,0], [0,0,1,0], [1,0,0,0]], [[0,1,0,0], [0,0,1,0], [0,0,1,0]]])
+        assert_array_almost_equal(H, answer/6., 4)
+        # Check that a sequence of arrays is accepted and H has the correct shape.
+        z = [squeeze(y) for y in split(x,3,axis=1)]
+        H, edges = histogramnd(z, bins=(4,3,2),range=[[-2,2], [0,3], [0,2]])
+        answer = asarray([[[0,0],[0,0],[0,0]], 
+                          [[0,1], [0,0], [1,0]], 
+                          [[0,1], [0,0],[0,0]], 
+                          [[0,0],[0,0],[0,0]]])
+        assert_array_equal(H, answer)
+                
+
 class test_unique(NumpyTestCase):
     def check_simple(self):
         x = array([4,3,2,1,1,2,3,4, 0])
diff --git a/numpy/lib/tests/test_twodim_base.py b/numpy/lib/tests/test_twodim_base.py
index cfe51bb9c..3c6edfd24 100644
--- a/numpy/lib/tests/test_twodim_base.py
+++ b/numpy/lib/tests/test_twodim_base.py
@@ -5,7 +5,8 @@
 from numpy.testing import *
 set_package_path()
 from numpy import arange, rot90, add, fliplr, flipud, zeros, ones, eye, \
-     array, diag
+     array, diag, histogram2d
+import numpy as np
 restore_path()
 
 ##################################################
@@ -133,13 +134,12 @@ class test_rot90(NumpyTestCase):
 
 class test_histogram2d(NumpyTestCase):
     def check_simple(self):
-        import numpy as np
         x = array([ 0.41702200,  0.72032449,  0.00011437481, 0.302332573,  0.146755891])
         y = array([ 0.09233859,  0.18626021,  0.34556073,  0.39676747,  0.53881673])
         xedges = np.linspace(0,1,10)
         yedges = np.linspace(0,1,10)
-        H = np.histogram2d(x, y, (xedges, yedges))[0]
-        answer = np.array([[0, 0, 0, 1, 0, 0, 0, 0, 0],
+        H = histogram2d(x, y, (xedges, yedges))[0]
+        answer = array([[0, 0, 0, 1, 0, 0, 0, 0, 0],
                            [0, 0, 0, 0, 0, 0, 1, 0, 0],
                            [0, 0, 0, 0, 0, 0, 0, 0, 0],
                            [1, 0, 1, 0, 0, 0, 0, 0, 0],
@@ -148,7 +148,26 @@ class test_histogram2d(NumpyTestCase):
                            [0, 0, 0, 0, 0, 0, 0, 0, 0],
                            [0, 0, 0, 0, 0, 0, 0, 0, 0],
                            [0, 0, 0, 0, 0, 0, 0, 0, 0]])
-        assert_equal(H, answer)
-
+        assert_array_equal(H.T, answer)
+    def check_asym(self):
+        x = array([1, 1, 2, 3, 4, 4, 4, 5])
+        y = array([1, 3, 2, 0, 1, 2, 3, 4])
+        H, xed, yed = histogram2d(x,y, (6, 5), range = [[0,6],[0,5]], normed=True)
+        answer = array([[0.,0,0,0,0],
+                        [0,1,0,1,0],
+                        [0,0,1,0,0],
+                        [1,0,0,0,0],
+                        [0,1,1,1,0],
+                        [0,0,0,0,1]])
+        assert_array_almost_equal(H, answer/8., 3)
+    def check_norm(self):
+        x = array([1,2,3,1,2,3,1,2,3])
+        y = array([1,1,1,2,2,2,3,3,3]) 
+        H, xed, yed = histogram2d(x,y,[[1,2,3,5], [1,2,3,5]], normed=True)
+        answer=array([[1,1,.5],
+                     [1,1,.5],
+                     [.5,.5,.25]])/9.
+        assert_array_almost_equal(H, answer, 3)
+        
 if __name__ == "__main__":
     NumpyTest().run()
diff --git a/numpy/lib/twodim_base.py b/numpy/lib/twodim_base.py
index 458378593..512b6bf53 100644
--- a/numpy/lib/twodim_base.py
+++ b/numpy/lib/twodim_base.py
@@ -2,8 +2,8 @@
 
 """
 
-__all__ = ['diag','diagflat','eye','fliplr','flipud','rot90','tri','triu','tril',
-           'vander','histogram2d']
+__all__ = ['diag','diagflat','eye','fliplr','flipud','rot90','tri','triu',
+           'tril','vander','histogram2d']
 
 from numpy.core.numeric import asanyarray, int_, equal, subtract, arange, \
      zeros, arange, greater_equal, multiply, ones, asarray
@@ -143,14 +143,21 @@ def vander(x, N=None):
         X[:,i] = x**(N-i-1)
     return X
 
-def  histogram2d(x,y, bins, normed = False):
-    """Compute the 2D histogram for a dataset (x,y) given the edges or
-         the number of bins. 
-
-    Returns histogram, xedges, yedges.
-    The histogram array is a count of the number of samples in each bin. 
-    The array is oriented such that H[i,j] is the number of samples falling
-        into binx[j] and biny[i]. 
+def  histogram2d(x,y, bins=10, range=None, normed=False):
+    """histogram2d(x,y, bins=10, range=None, normed=False) -> H, xedges, yedges
+    
+    Compute the 2D histogram from samples x,y. 
+
+    Parameters
+    ----------
+    x,y: 1D data series. Both arrays must have the same length.
+    bins: Number of bins -or- [nbin x, nbin y] -or- 
+         [bin edges] -or- [x bin edges, y bin edges].
+    range:  A sequence of lower and upper bin edges (default: [min, max]).
+    normed: True or False. 
+    
+    The histogram array is a count of the number of samples in each 
+    two dimensional bin. 
     Setting normed to True returns a density rather than a bin count. 
     Data falling outside of the edges are not counted.
     """
@@ -160,33 +167,40 @@ def  histogram2d(x,y, bins, normed = False):
     except TypeError:
         N = 1
         bins = [bins]
+    x = asarray(x)
+    y = asarray(y)
+    if range is None:
+        xmin, xmax = x.min(), x.max()
+        ymin, ymax = y.min(), y.max()
+    else:
+        xmin, xmax = range[0]
+        ymin, ymax = range[1]
     if N == 2:
         if np.isscalar(bins[0]):
             xnbin = bins[0]
-            xedges = np.linspace(x.min(), x.max(), xnbin+1)
-            
+            xedges = np.linspace(xmin, xmax, xnbin+1)
         else:
             xedges = asarray(bins[0], float)
             xnbin = len(xedges)-1
-        
         if np.isscalar(bins[1]):
             ynbin = bins[1]
-            yedges = np.linspace(y.min(), y.max(), ynbin+1)
+            yedges = np.linspace(ymin, ymax, ynbin+1)
         else:
             yedges = asarray(bins[1], float)
             ynbin = len(yedges)-1
     elif N == 1:
         ynbin = xnbin = bins[0]
-        xedges = np.linspace(x.min(), x.max(), xnbin+1)
-        yedges = np.linspace(y.max(), y.min(), ynbin+1)
-        xedges[-1] *= 1.0001
-        yedges[-1] *= 1.0001         
+        xedges = np.linspace(xmin, xmax, xnbin+1)
+        yedges = np.linspace(ymin, ymax, ynbin+1)
     else:
         yedges = asarray(bins, float)
         xedges = yedges.copy()
         ynbin = len(yedges)-1
         xnbin = len(xedges)-1
     
+    dxedges = np.diff(xedges)
+    dyedges = np.diff(yedges)
+    
     # Flattened histogram matrix (1D)
     hist = np.zeros((xnbin)*(ynbin), int)
 
@@ -194,30 +208,41 @@ def  histogram2d(x,y, bins, normed = False):
     xbin = np.digitize(x,xedges) 
     ybin = np.digitize(y,yedges) 
  
-    # Remove the outliers
+    # Values that fall on an edge are put in the right bin.
+    # For the rightmost bin, we want values equal to the right 
+    # edge to be counted in the last bin, and not as an outlier.
+    xdecimal = int(-np.log10(dxedges.min()))+6
+    ydecimal = int(-np.log10(dyedges.min()))+6
+    on_edge_x = np.where(np.around(x,xdecimal) == np.around(xedges[-1], xdecimal))[0]
+    on_edge_y = np.where(np.around(y,ydecimal) == np.around(yedges[-1], ydecimal))[0]
+    xbin[on_edge_x] -= 1
+    ybin[on_edge_y] -= 1
+    # Remove the true outliers
     outliers = (xbin==0) | (xbin==xnbin+1) | (ybin==0) | (ybin == ynbin+1)
-
-    xbin = xbin[outliers==False]
-    ybin = ybin[outliers==False]
+    xbin = xbin[outliers==False] - 1
+    ybin = ybin[outliers==False] - 1
     
     # Compute the sample indices in the flattened histogram matrix.
     if xnbin >= ynbin:
         xy = ybin*(xnbin) + xbin
-        shift = xnbin + 1
+        
     else:
         xy = xbin*(ynbin) + ybin
-        shift = ynbin + 1
+        
        
     # Compute the number of repetitions in xy and assign it to the flattened
     #  histogram matrix.
+
     flatcount = np.bincount(xy)
-    indices = np.arange(len(flatcount)-shift)
-    hist[indices] = flatcount[shift:]
+    indices = np.arange(len(flatcount))
+    hist[indices] = flatcount
 
+    shape = np.sort([xnbin, ynbin])
     # Shape into a proper matrix
-    histmat = hist.reshape(xnbin, ynbin)
-    
+    histmat = hist.reshape(shape)
+    if (shape == (ynbin, xnbin)).all():
+        histmat = histmat.T
     if normed:
-        diff2 = np.outer(np.diff(yedges), np.diff(xedges))
+        diff2 = np.outer(dxedges, dyedges)
         histmat = histmat / diff2 / histmat.sum()
     return histmat, xedges, yedges
diff --git a/numpy/oldnumeric/functions.py b/numpy/oldnumeric/functions.py
index f03f2fb21..475802c88 100644
--- a/numpy/oldnumeric/functions.py
+++ b/numpy/oldnumeric/functions.py
@@ -119,3 +119,4 @@ def cross_product(a, b, axis1=-1, axis2=-1):
 
 def average(a, axis=0, weights=None, returned=False):
     return N.average(a, axis, weights, returned)
+
author	Travis Oliphant <oliphant@enthought.com>	2006-09-14 01:49:10 +0000
committer	Travis Oliphant <oliphant@enthought.com>	2006-09-14 01:49:10 +0000
commit	4e76e00cc5afceaf70fe8d655cf59d4a9fb85a0a (patch)
tree	39a078cf6b53847505be2cb55dad65fcfc1056e4 /numpy
parent	6359dccac9d940dc3291de360b4cb377183e1b9d (diff)
download	numpy-4e76e00cc5afceaf70fe8d655cf59d4a9fb85a0a.tar.gz