diff options
author | Travis Oliphant <oliphant@enthought.com> | 2006-09-14 01:49:10 +0000 |
---|---|---|
committer | Travis Oliphant <oliphant@enthought.com> | 2006-09-14 01:49:10 +0000 |
commit | 4e76e00cc5afceaf70fe8d655cf59d4a9fb85a0a (patch) | |
tree | 39a078cf6b53847505be2cb55dad65fcfc1056e4 /numpy | |
parent | 6359dccac9d940dc3291de360b4cb377183e1b9d (diff) | |
download | numpy-4e76e00cc5afceaf70fe8d655cf59d4a9fb85a0a.tar.gz |
Add histogramnd and fix histogram2d
Diffstat (limited to 'numpy')
-rw-r--r-- | numpy/lib/function_base.py | 143 | ||||
-rw-r--r-- | numpy/lib/tests/test_function_base.py | 25 | ||||
-rw-r--r-- | numpy/lib/tests/test_twodim_base.py | 31 | ||||
-rw-r--r-- | numpy/lib/twodim_base.py | 83 | ||||
-rw-r--r-- | numpy/oldnumeric/functions.py | 1 |
5 files changed, 238 insertions, 45 deletions
diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py index 29308d443..4701cb03e 100644 --- a/numpy/lib/function_base.py +++ b/numpy/lib/function_base.py @@ -4,23 +4,23 @@ __all__ = ['logspace', 'linspace', 'diff', 'gradient', 'angle', 'unwrap', 'sort_complex', 'disp', 'unique', 'extract', 'place', 'nansum', 'nanmax', 'nanargmax', 'nanargmin', 'nanmin', 'vectorize', 'asarray_chkfinite', 'average', - 'histogram', 'bincount', 'digitize', 'cov', 'corrcoef', 'msort', - 'median', 'sinc', 'hamming', 'hanning', 'bartlett', 'blackman', - 'kaiser', 'trapz', 'i0', 'add_newdoc', 'add_docstring', 'meshgrid', - 'delete', 'insert', 'append' + 'histogram', 'histogramnd', 'bincount', 'digitize', 'cov', + 'corrcoef', 'msort', 'median', 'sinc', 'hamming', 'hanning', + 'bartlett', 'blackman', 'kaiser', 'trapz', 'i0', 'add_newdoc', + 'add_docstring', 'meshgrid', 'delete', 'insert', 'append' ] import types import numpy.core.numeric as _nx from numpy.core.numeric import ones, zeros, arange, concatenate, array, \ - asarray, asanyarray, empty, empty_like, asanyarray, ndarray + asarray, asanyarray, empty, empty_like, asanyarray, ndarray, around from numpy.core.numeric import ScalarType, dot, where, newaxis, intp, \ - integer + integer, isscalar from numpy.core.umath import pi, multiply, add, arctan2, \ - frompyfunc, isnan, cos, less_equal, sqrt, sin, mod, exp + frompyfunc, isnan, cos, less_equal, sqrt, sin, mod, exp, log10 from numpy.core.fromnumeric import ravel, nonzero, choose, sort from numpy.core.numerictypes import typecodes -from numpy.lib.shape_base import atleast_1d +from numpy.lib.shape_base import atleast_1d, atleast_2d from numpy.lib.twodim_base import diag from _compiled_base import _insert, add_docstring from _compiled_base import digitize, bincount @@ -75,8 +75,7 @@ def histogram(a, bins=10, range=None, normed=False): ---------- bins: Number of bins range: Lower and upper bin edges (default: [sample.min(), sample.max()]). - Does not really work, all values greater than range are stored in - the last bin. + All values greater than range are stored in the last bin. normed: If False (default), return the number of samples in each bin. If True, return a frequency distribution. @@ -104,6 +103,130 @@ def histogram(a, bins=10, range=None, normed=False): else: return n, bins +def histogramnd(sample, bins=10, range=None, normed=False): + """histogramnd(sample, bins = 10, range = None, normed = False) -> H, edges + + Return the N-dimensional histogram computed from sample. + + Parameters + ---------- + sample: A sequence of N arrays, or an KxN array. + bins: A sequence of edge arrays, or a sequence of the number of bins. + If a scalar is given, it is assumed to be the number of bins + for all dimensions. + range: A sequence of lower and upper bin edges (default: [min, max]). + normed: If False, returns the number of samples in each bin. + If True, returns the frequency distribution. + + + Output + ------ + H: Histogram array. + edges: List of arrays defining the bin edges. + + Example: + x = random.randn(100,3) + H, edges = histogramnd(x, bins = (5, 6, 7)) + + See also: histogram + """ + + try: + N, D = sample.shape + except (AttributeError, ValueError): + ss = atleast_2d(sample) + sample = ss.transpose() + N, D = sample.shape + + nbin = empty(D, int) + edges = D*[None] + dedges = D*[None] + + try: + M = len(bins) + if M != D: + raise AttributeError, 'The dimension of bins must be a equal to the dimension of the sample x.' + except TypeError: + bins = D*[bins] + + if range is None: + smin = atleast_1d(sample.min(0)) + smax = atleast_1d(sample.max(0)) + else: + smin = zeros(D) + smax = zeros(D) + for i in arange(D): + smin[i], smax[i] = range[i] + + for i in arange(D): + if isscalar(bins[i]): + nbin[i] = bins[i] + edges[i] = linspace(smin[i], smax[i], nbin[i]+1) + else: + edges[i] = asarray(bins[i], float) + nbin[i] = len(edges[i])-1 + + + + Ncount = {} + nbin = asarray(nbin) + + for i in arange(D): + Ncount[i] = digitize(sample[:,i], edges[i]) + dedges[i] = diff(edges[i]) + # Remove values falling outside of bins + # Values that fall on an edge are put in the right bin. + # For the rightmost bin, we want values equal to the right + # edge to be counted in the last bin, and not as an outlier. + outliers = zeros(N, int) + for i in arange(D): + decimal = int(-log10(dedges[i].min())) +6 + on_edge = where(around(sample[:,i], decimal) == around(edges[i][-1], decimal))[0] + Ncount[i][on_edge] -= 1 + outliers += (Ncount[i] == 0) | (Ncount[i] == nbin[i]+1) + indices = where(outliers == 0)[0] + for i in arange(D): + Ncount[i] = Ncount[i][indices] - 1 + N = len(indices) + + # Flattened histogram matrix (1D) + hist = zeros(nbin.prod(), int) + + # Compute the sample indices in the flattened histogram matrix. + ni = nbin.argsort() + shape = [] + xy = zeros(N, int) + for i in arange(0, D-1): + xy += Ncount[ni[i]] * nbin[ni[i+1:]].prod() + + xy += Ncount[ni[-1]] + + # Compute the number of repetitions in xy and assign it to the flattened histmat. + if len(xy) == 0: + return zeros(nbin, int) + + flatcount = bincount(xy) + a = arange(len(flatcount)) + hist[a] = flatcount + + # Shape into a proper matrix + hist = hist.reshape(sort(nbin)) + for i,j in enumerate(ni): + hist = hist.swapaxes(i,j) + if (hist.shape == nbin).all(): + break + + if normed: + s = hist.sum() + for i in arange(D): + shape = ones(D, int) + shape[i] = nbin[i] + hist = hist / dedges[i].reshape(shape) + hist /= s + + return hist, edges + + def average(a, axis=None, weights=None, returned=False): """average(a, axis=None weights=None, returned=False) diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py index b548ce386..a8ccf5eaa 100644 --- a/numpy/lib/tests/test_function_base.py +++ b/numpy/lib/tests/test_function_base.py @@ -353,6 +353,31 @@ class test_histogram(NumpyTestCase): (a,b)=histogram(linspace(0,10,100)) assert(all(a==10)) +class test_histogramnd(NumpyTestCase): + def check_simple(self): + x = array([[-.5, .5, 1.5], [-.5, 1.5, 2.5], [-.5, 2.5, .5], \ + [.5, .5, 1.5], [.5, 1.5, 2.5], [.5, 2.5, 2.5]]) + H, edges = histogramnd(x, (2,3,3), range = [[-1,1], [0,3], [0,3]]) + answer = asarray([[[0,1,0], [0,0,1], [1,0,0]], [[0,1,0], [0,0,1], [0,0,1]]]) + assert(all(H == answer)) + # Check normalization + ed = [[-2,0,2], [0,1,2,3], [0,1,2,3]] + H, edges = histogramnd(x, bins = ed, normed = True) + assert(all(H == answer/12.)) + # Check that H has the correct shape. + H, edges = histogramnd(x, (2,3,4), range = [[-1,1], [0,3], [0,4]], normed=True) + answer = asarray([[[0,1,0,0], [0,0,1,0], [1,0,0,0]], [[0,1,0,0], [0,0,1,0], [0,0,1,0]]]) + assert_array_almost_equal(H, answer/6., 4) + # Check that a sequence of arrays is accepted and H has the correct shape. + z = [squeeze(y) for y in split(x,3,axis=1)] + H, edges = histogramnd(z, bins=(4,3,2),range=[[-2,2], [0,3], [0,2]]) + answer = asarray([[[0,0],[0,0],[0,0]], + [[0,1], [0,0], [1,0]], + [[0,1], [0,0],[0,0]], + [[0,0],[0,0],[0,0]]]) + assert_array_equal(H, answer) + + class test_unique(NumpyTestCase): def check_simple(self): x = array([4,3,2,1,1,2,3,4, 0]) diff --git a/numpy/lib/tests/test_twodim_base.py b/numpy/lib/tests/test_twodim_base.py index cfe51bb9c..3c6edfd24 100644 --- a/numpy/lib/tests/test_twodim_base.py +++ b/numpy/lib/tests/test_twodim_base.py @@ -5,7 +5,8 @@ from numpy.testing import * set_package_path() from numpy import arange, rot90, add, fliplr, flipud, zeros, ones, eye, \ - array, diag + array, diag, histogram2d +import numpy as np restore_path() ################################################## @@ -133,13 +134,12 @@ class test_rot90(NumpyTestCase): class test_histogram2d(NumpyTestCase): def check_simple(self): - import numpy as np x = array([ 0.41702200, 0.72032449, 0.00011437481, 0.302332573, 0.146755891]) y = array([ 0.09233859, 0.18626021, 0.34556073, 0.39676747, 0.53881673]) xedges = np.linspace(0,1,10) yedges = np.linspace(0,1,10) - H = np.histogram2d(x, y, (xedges, yedges))[0] - answer = np.array([[0, 0, 0, 1, 0, 0, 0, 0, 0], + H = histogram2d(x, y, (xedges, yedges))[0] + answer = array([[0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 1, 0, 0, 0, 0, 0, 0], @@ -148,7 +148,26 @@ class test_histogram2d(NumpyTestCase): [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0]]) - assert_equal(H, answer) - + assert_array_equal(H.T, answer) + def check_asym(self): + x = array([1, 1, 2, 3, 4, 4, 4, 5]) + y = array([1, 3, 2, 0, 1, 2, 3, 4]) + H, xed, yed = histogram2d(x,y, (6, 5), range = [[0,6],[0,5]], normed=True) + answer = array([[0.,0,0,0,0], + [0,1,0,1,0], + [0,0,1,0,0], + [1,0,0,0,0], + [0,1,1,1,0], + [0,0,0,0,1]]) + assert_array_almost_equal(H, answer/8., 3) + def check_norm(self): + x = array([1,2,3,1,2,3,1,2,3]) + y = array([1,1,1,2,2,2,3,3,3]) + H, xed, yed = histogram2d(x,y,[[1,2,3,5], [1,2,3,5]], normed=True) + answer=array([[1,1,.5], + [1,1,.5], + [.5,.5,.25]])/9. + assert_array_almost_equal(H, answer, 3) + if __name__ == "__main__": NumpyTest().run() diff --git a/numpy/lib/twodim_base.py b/numpy/lib/twodim_base.py index 458378593..512b6bf53 100644 --- a/numpy/lib/twodim_base.py +++ b/numpy/lib/twodim_base.py @@ -2,8 +2,8 @@ """ -__all__ = ['diag','diagflat','eye','fliplr','flipud','rot90','tri','triu','tril', - 'vander','histogram2d'] +__all__ = ['diag','diagflat','eye','fliplr','flipud','rot90','tri','triu', + 'tril','vander','histogram2d'] from numpy.core.numeric import asanyarray, int_, equal, subtract, arange, \ zeros, arange, greater_equal, multiply, ones, asarray @@ -143,14 +143,21 @@ def vander(x, N=None): X[:,i] = x**(N-i-1) return X -def histogram2d(x,y, bins, normed = False): - """Compute the 2D histogram for a dataset (x,y) given the edges or - the number of bins. - - Returns histogram, xedges, yedges. - The histogram array is a count of the number of samples in each bin. - The array is oriented such that H[i,j] is the number of samples falling - into binx[j] and biny[i]. +def histogram2d(x,y, bins=10, range=None, normed=False): + """histogram2d(x,y, bins=10, range=None, normed=False) -> H, xedges, yedges + + Compute the 2D histogram from samples x,y. + + Parameters + ---------- + x,y: 1D data series. Both arrays must have the same length. + bins: Number of bins -or- [nbin x, nbin y] -or- + [bin edges] -or- [x bin edges, y bin edges]. + range: A sequence of lower and upper bin edges (default: [min, max]). + normed: True or False. + + The histogram array is a count of the number of samples in each + two dimensional bin. Setting normed to True returns a density rather than a bin count. Data falling outside of the edges are not counted. """ @@ -160,33 +167,40 @@ def histogram2d(x,y, bins, normed = False): except TypeError: N = 1 bins = [bins] + x = asarray(x) + y = asarray(y) + if range is None: + xmin, xmax = x.min(), x.max() + ymin, ymax = y.min(), y.max() + else: + xmin, xmax = range[0] + ymin, ymax = range[1] if N == 2: if np.isscalar(bins[0]): xnbin = bins[0] - xedges = np.linspace(x.min(), x.max(), xnbin+1) - + xedges = np.linspace(xmin, xmax, xnbin+1) else: xedges = asarray(bins[0], float) xnbin = len(xedges)-1 - if np.isscalar(bins[1]): ynbin = bins[1] - yedges = np.linspace(y.min(), y.max(), ynbin+1) + yedges = np.linspace(ymin, ymax, ynbin+1) else: yedges = asarray(bins[1], float) ynbin = len(yedges)-1 elif N == 1: ynbin = xnbin = bins[0] - xedges = np.linspace(x.min(), x.max(), xnbin+1) - yedges = np.linspace(y.max(), y.min(), ynbin+1) - xedges[-1] *= 1.0001 - yedges[-1] *= 1.0001 + xedges = np.linspace(xmin, xmax, xnbin+1) + yedges = np.linspace(ymin, ymax, ynbin+1) else: yedges = asarray(bins, float) xedges = yedges.copy() ynbin = len(yedges)-1 xnbin = len(xedges)-1 + dxedges = np.diff(xedges) + dyedges = np.diff(yedges) + # Flattened histogram matrix (1D) hist = np.zeros((xnbin)*(ynbin), int) @@ -194,30 +208,41 @@ def histogram2d(x,y, bins, normed = False): xbin = np.digitize(x,xedges) ybin = np.digitize(y,yedges) - # Remove the outliers + # Values that fall on an edge are put in the right bin. + # For the rightmost bin, we want values equal to the right + # edge to be counted in the last bin, and not as an outlier. + xdecimal = int(-np.log10(dxedges.min()))+6 + ydecimal = int(-np.log10(dyedges.min()))+6 + on_edge_x = np.where(np.around(x,xdecimal) == np.around(xedges[-1], xdecimal))[0] + on_edge_y = np.where(np.around(y,ydecimal) == np.around(yedges[-1], ydecimal))[0] + xbin[on_edge_x] -= 1 + ybin[on_edge_y] -= 1 + # Remove the true outliers outliers = (xbin==0) | (xbin==xnbin+1) | (ybin==0) | (ybin == ynbin+1) - - xbin = xbin[outliers==False] - ybin = ybin[outliers==False] + xbin = xbin[outliers==False] - 1 + ybin = ybin[outliers==False] - 1 # Compute the sample indices in the flattened histogram matrix. if xnbin >= ynbin: xy = ybin*(xnbin) + xbin - shift = xnbin + 1 + else: xy = xbin*(ynbin) + ybin - shift = ynbin + 1 + # Compute the number of repetitions in xy and assign it to the flattened # histogram matrix. + flatcount = np.bincount(xy) - indices = np.arange(len(flatcount)-shift) - hist[indices] = flatcount[shift:] + indices = np.arange(len(flatcount)) + hist[indices] = flatcount + shape = np.sort([xnbin, ynbin]) # Shape into a proper matrix - histmat = hist.reshape(xnbin, ynbin) - + histmat = hist.reshape(shape) + if (shape == (ynbin, xnbin)).all(): + histmat = histmat.T if normed: - diff2 = np.outer(np.diff(yedges), np.diff(xedges)) + diff2 = np.outer(dxedges, dyedges) histmat = histmat / diff2 / histmat.sum() return histmat, xedges, yedges diff --git a/numpy/oldnumeric/functions.py b/numpy/oldnumeric/functions.py index f03f2fb21..475802c88 100644 --- a/numpy/oldnumeric/functions.py +++ b/numpy/oldnumeric/functions.py @@ -119,3 +119,4 @@ def cross_product(a, b, axis1=-1, axis2=-1): def average(a, axis=0, weights=None, returned=False): return N.average(a, axis, weights, returned) + |