Diffstat (limited to 'numpy/lib')
-rw-r--r-- | numpy/lib/function_base.py            | 185
-rw-r--r-- | numpy/lib/tests/test_function_base.py |  24
2 files changed, 114 insertions, 95 deletions
diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py
index 648eb5019..31aafda15 100644
--- a/numpy/lib/function_base.py
+++ b/numpy/lib/function_base.py
@@ -152,18 +152,20 @@ def _hist_bin_sqrt(x):
     """
     Square root histogram bin estimator.
 
-    Used by many programs for its simplicity.
+    Bin width is inversely proportional to the data size. Used by many
+    programs for its simplicity.
 
     Parameters
     ----------
     x : array_like
-        Input data that is to be histogrammed.
+        Input data that is to be histogrammed, trimmed to range. May not
+        be empty.
 
     Returns
     -------
-    n : An estimate of the optimal bin count for the given data.
+    h : An estimate of the optimal bin width for the given data.
     """
-    return int(np.ceil(np.sqrt(x.size)))
+    return x.ptp() / np.sqrt(x.size)
 
 
 def _hist_bin_sturges(x):
@@ -178,13 +180,14 @@ def _hist_bin_sturges(x):
     Parameters
     ----------
     x : array_like
-        Input data that is to be histogrammed.
+        Input data that is to be histogrammed, trimmed to range. May not
+        be empty.
 
     Returns
     -------
-    n : An estimate of the optimal bin count for the given data.
+    h : An estimate of the optimal bin width for the given data.
     """
-    return int(np.ceil(np.log2(x.size))) + 1
+    return x.ptp() / np.ceil(np.log2(x.size) + 1.0)
 
 
 def _hist_bin_rice(x):
@@ -200,13 +203,14 @@ def _hist_bin_rice(x):
     Parameters
     ----------
     x : array_like
-        Input data that is to be histogrammed.
+        Input data that is to be histogrammed, trimmed to range. May not
+        be empty.
 
     Returns
     -------
-    n : An estimate of the optimal bin count for the given data.
+    h : An estimate of the optimal bin width for the given data.
     """
-    return int(np.ceil(2 * x.size ** (1.0 / 3)))
+    return x.ptp() / (2.0 * x.size ** (1.0 / 3))
 
 
 def _hist_bin_scott(x):
@@ -220,16 +224,14 @@ def _hist_bin_scott(x):
     Parameters
     ----------
     x : array_like
-        Input data that is to be histogrammed.
+        Input data that is to be histogrammed, trimmed to range. May not
+        be empty.
 
     Returns
     -------
-    n : An estimate of the optimal bin count for the given data.
+    h : An estimate of the optimal bin width for the given data.
     """
-    h = (24 * np.pi**0.5 / x.size)**(1.0 / 3) * np.std(x)
-    if h > 0:
-        return int(np.ceil(x.ptp() / h))
-    return 1
+    return (24.0 * np.pi**0.5 / x.size)**(1.0 / 3.0) * np.std(x)
 
 
 def _hist_bin_doane(x):
@@ -243,16 +245,17 @@ def _hist_bin_doane(x):
     Parameters
     ----------
     x : array_like
-        Input data that is to be histogrammed.
+        Input data that is to be histogrammed, trimmed to range. May not
+        be empty.
 
     Returns
     -------
-    n : An estimate of the optimal bin count for the given data.
+    h : An estimate of the optimal bin width for the given data.
     """
     if x.size > 2:
         sg1 = np.sqrt(6.0 * (x.size - 2) / ((x.size + 1.0) * (x.size + 3)))
         sigma = np.std(x)
-        if sigma > 0:
+        if sigma > 0.0:
             # These three operations add up to
             # g1 = np.mean(((x - np.mean(x)) / sigma)**3)
             # but use only one temp array instead of three
@@ -260,21 +263,21 @@ def _hist_bin_doane(x):
             np.true_divide(temp, sigma, temp)
             np.power(temp, 3, temp)
             g1 = np.mean(temp)
-            return int(np.ceil(1.0 + np.log2(x.size) +
-                               np.log2(1.0 + np.absolute(g1) / sg1)))
-    return 1
+            return x.ptp() / (1.0 + np.log2(x.size) +
+                              np.log2(1.0 + np.absolute(g1) / sg1))
+    return 0.0
 
 
 def _hist_bin_fd(x):
     """
     The Freedman-Diaconis histogram bin estimator.
 
-    The Freedman-Diaconis rule uses interquartile range (IQR)
-    binwidth. It is considered a variation of the Scott rule with more
-    robustness as the IQR is less affected by outliers than the standard
-    deviation. However, the IQR depends on fewer points than the
-    standard deviation, so it is less accurate, especially for long
-    tailed distributions.
+    The Freedman-Diaconis rule uses interquartile range (IQR) to
+    estimate binwidth. It is considered a variation of the Scott rule
+    with more robustness as the IQR is less affected by outliers than
+    the standard deviation. However, the IQR depends on fewer points
+    than the standard deviation, so it is less accurate, especially for
+    long tailed distributions.
 
     If the IQR is 0, this function returns 1 for the number of bins.
     Binwidth is inversely proportional to the cube root of data size
@@ -283,46 +286,44 @@ def _hist_bin_fd(x):
     Parameters
     ----------
     x : array_like
-        Input data that is to be histogrammed.
+        Input data that is to be histogrammed, trimmed to range. May not
+        be empty.
 
     Returns
     -------
-    n : An estimate of the optimal bin count for the given data.
+    h : An estimate of the optimal bin width for the given data.
     """
     iqr = np.subtract(*np.percentile(x, [75, 25]))
-
-    if iqr > 0:
-        h = (2 * iqr * x.size ** (-1.0 / 3))
-        return int(np.ceil(x.ptp() / h))
-
-    # If iqr is 0, default number of bins is 1
-    return 1
+    return 2.0 * iqr * x.size ** (-1.0 / 3.0)
 
 
 def _hist_bin_auto(x):
     """
-    Histogram bin estimator that uses the maximum of the
+    Histogram bin estimator that uses the minimum width of the
     Freedman-Diaconis and Sturges estimators.
 
-    The FD estimator is usually the most robust method, but it tends to
-    be too small for small `x`. The Sturges estimator is quite good for
-    small (<1000) datasets and is the default in the R language. This
-    method gives good off the shelf behaviour.
+    The FD estimator is usually the most robust method, but its width
+    estimate tends to be too large for small `x`. The Sturges estimator
+    is quite good for small (<1000) datasets and is the default in the R
+    language. This method gives good off the shelf behaviour.
 
     Parameters
     ----------
     x : array_like
-        Input data that is to be histogrammed.
+        Input data that is to be histogrammed, trimmed to range. May not
+        be empty.
 
     Returns
    -------
-    n : An estimate of the optimal bin count for the given data.
+    h : An estimate of the optimal bin width for the given data.
 
     See Also
     --------
     _hist_bin_fd, _hist_bin_sturges
     """
-    return max(_hist_bin_fd(x), _hist_bin_sturges(x))
+    # There is no need to check for zero here. If ptp is, so is IQR and
+    # vice versa. Either both are zero or neither one is.
+    return min(_hist_bin_fd(x), _hist_bin_sturges(x))
 
 
 # Private dict initialized at module load time
@@ -353,8 +354,12 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
         .. versionadded:: 1.11.0
 
         If `bins` is a string from the list below, `histogram` will use
-        the method chosen to calculate the optimal number of bins (see
-        `Notes` for more detail on the estimators). For visualisation,
+        the method chosen to calculate the optimal bin width and
+        consequently the number of bins (see `Notes` for more detail on
+        the estimators) from the data that falls within the requested
+        range. While the bin width will be optimal for the actual data
+        in the range, the number of bins will be computed to fill the
+        entire range, including the empty portions. For visualisation,
         using the 'auto' option is suggested. Weighted data is not
         supported for automated bin size selection.
 
@@ -390,7 +395,11 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
     range : (float, float), optional
         The lower and upper range of the bins. If not provided, range
         is simply ``(a.min(), a.max())``. Values outside the range are
-        ignored.
+        ignored. The first element of the range must be less than or
+        equal to the second. `range` affects the automatic bin
+        computation as well. While bin width is computed to be optimal
+        based on the actual data within `range`, the bin count will fill
+        the entire range including portions containing no data.
     normed : bool, optional
         This keyword is deprecated in Numpy 1.6 due to confusing/buggy
         behavior. It will be removed in Numpy 2.0. Use the ``density``
@@ -442,13 +451,16 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
 
     .. versionadded:: 1.11.0
 
-    The methods to estimate the optimal number of bins are well found in
-    literature, and are inspired by the choices R provides for histogram
-    visualisation. Note that having the number of bins proportional to
-    :math:`n^{1/3}` is asymptotically optimal, which is why it appears
-    in most estimators. These are simply plug-in methods that give good
-    starting points for number of bins. In the equations below,
-    :math:`h` is the binwidth and :math:`n_h` is the number of bins.
+    The methods to estimate the optimal number of bins are well founded
+    in literature, and are inspired by the choices R provides for
+    histogram visualisation. Note that having the number of bins
+    proportional to :math:`n^{1/3}` is asymptotically optimal, which is
+    why it appears in most estimators. These are simply plug-in methods
+    that give good starting points for number of bins. In the equations
+    below, :math:`h` is the binwidth and :math:`n_h` is the number of
+    bins. All estimators that compute bin counts are recast to bin width
+    using the `ptp` of the data. The final bin count is obtained from
+    ``np.round(np.ceil(range / h))``.
 
     'Auto' (maximum of the 'Sturges' and 'FD' estimators)
         A compromise to get a good value. For small datasets the Sturges
@@ -476,14 +488,14 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
         estimator in the absence of outliers.
 
     'Rice'
-        .. math:: n_h = \left\lceil 2n^{1/3} \right\rceil
+        .. math:: n_h = 2n^{1/3}
 
         The number of bins is only proportional to cube root of
        ``a.size``. It tends to overestimate the number of bins and it
        does not take into account data variability.
 
     'Sturges'
-        .. math:: n_h = \left\lceil \log _{2}n+1 \right\rceil
+        .. math:: n_h = \log _{2}n+1
 
         The number of bins is the base 2 log of ``a.size``. This
         estimator assumes normality of data and is too conservative for
@@ -491,19 +503,19 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
         ``hist`` method.
 
     'Doane'
-        .. math:: n_h = \left\lceil 1 + \log_{2}(n) +
-                        \log_{2}(1 + \frac{|g_1|}{\sigma_{g_1})}
-                  \right\rceil
+        .. math:: n_h = 1 + \log_{2}(n) +
+                        \log_{2}(1 + \frac{|g_1|}{\sigma_{g_1}})
 
             g_1 = mean[(\frac{x - \mu}{\sigma})^3]
 
             \sigma_{g_1} = \sqrt{\frac{6(n - 2)}{(n + 1)(n + 3)}}
 
         An improved version of Sturges' formula that produces better
-        estimates for non-normal datasets.
+        estimates for non-normal datasets. This estimator attempts to
+        account for the skew of the data.
 
     'Sqrt'
-        .. math:: n_h = \left\lceil \sqrt n \right\rceil
+        .. math:: n_h = \sqrt n
 
         The simplest and fastest estimator. Only takes into account the
         data size.
@@ -548,20 +560,30 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
         weights = weights.ravel()
     a = a.ravel()
 
-    if (range is not None):
-        mn, mx = range
-        if (mn > mx):
-            raise ValueError(
-                'max must be larger than min in range parameter.')
-        if not np.all(np.isfinite([mn, mx])):
-            raise ValueError(
-                'range parameter must be finite.')
+    # Do not modify the original value of range so we can check for `None`
+    if range is None:
+        if a.size == 0:
+            # handle empty arrays. Can't determine range, so use 0-1.
+            mn, mx = 0.0, 1.0
+        else:
+            mn, mx = a.min() + 0.0, a.max() + 0.0
+    else:
+        mn, mx = [mi + 0.0 for mi in range]
+        if mn > mx:
+            raise ValueError(
+                'max must be larger than min in range parameter.')
+        if not np.all(np.isfinite([mn, mx])):
+            raise ValueError(
+                'range parameter must be finite.')
+    if mn == mx:
+        mn -= 0.5
+        mx += 0.5
 
     if isinstance(bins, basestring):
         # if `bins` is a string for an automatic method,
         # this will replace it with the number of bins calculated
         if bins not in _hist_bin_selectors:
-            raise ValueError("{0} not a valid estimator for `bins`".format(bins))
+            raise ValueError("{0} not a valid estimator for bins".format(bins))
         if weights is not None:
             raise TypeError("Automated estimation of the number of "
                             "bins is not supported for weighted data")
@@ -569,15 +591,22 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
         b = a
         # Update the reference if the range needs truncation
         if range is not None:
-            mn, mx = range
             keep = (a >= mn)
             keep &= (a <= mx)
             if not np.logical_and.reduce(keep):
                 b = a[keep]
+
         if b.size == 0:
             bins = 1
         else:
-            bins = _hist_bin_selectors[bins](b)
+            # Do not call selectors on empty arrays
+            width = _hist_bin_selectors[bins](b)
+            if width:
+                bins = int(np.ceil((mx - mn) / width))
+            else:
+                # Width can be zero for some estimators, e.g. FD when
+                # the IQR of the data is zero.
+                bins = 1
 
     # Histogram is an integer or a float array depending on the weights.
     if weights is None:
@@ -593,16 +622,6 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
     if np.isscalar(bins) and bins < 1:
         raise ValueError(
             '`bins` should be a positive integer.')
-    if range is None:
-        if a.size == 0:
-            # handle empty arrays. Can't determine range, so use 0-1.
-            range = (0, 1)
-        else:
-            range = (a.min(), a.max())
-    mn, mx = [mi + 0.0 for mi in range]
-    if mn == mx:
-        mn -= 0.5
-        mx += 0.5
     # At this point, if the weights are not integer, floating point, or
     # complex, we have to use the slow algorithm.
     if weights is not None and not (np.can_cast(weights.dtype, np.double) or
diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py
index 945992fc0..20c786ad1 100644
--- a/numpy/lib/tests/test_function_base.py
+++ b/numpy/lib/tests/test_function_base.py
@@ -1432,9 +1432,9 @@ class TestHistogramOptimBinNums(TestCase):
         for testlen, expectedResults in basic_test.items():
             # Create some sort of non uniform data to test with
             # (2 peak uniform mixture)
-            x1 = np.linspace(-10, -1, testlen/5 * 2)
-            x2 = np.linspace(1,10, testlen/5 * 3)
-            x = np.hstack((x1, x2))
+            x1 = np.linspace(-10, -1, testlen // 5 * 2)
+            x2 = np.linspace(1, 10, testlen // 5 * 3)
+            x = np.concatenate((x1, x2))
             for estimator, numbins in expectedResults.items():
                 a, b = np.histogram(x, estimator)
                 assert_equal(len(a), numbins, err_msg="For the {0} estimator "
@@ -1446,7 +1446,7 @@ class TestHistogramOptimBinNums(TestCase):
         adaptive methods, especially the FD method. All bin numbers have
         been precalculated.
""" - small_dat = {1: {'fd': 1, 'scott': 1, 'rice': 2, 'sturges': 1, + small_dat = {1: {'fd': 1, 'scott': 1, 'rice': 1, 'sturges': 1, 'doane': 1, 'sqrt': 1}, 2: {'fd': 2, 'scott': 1, 'rice': 3, 'sturges': 2, 'doane': 1, 'sqrt': 2}, @@ -1474,8 +1474,8 @@ class TestHistogramOptimBinNums(TestCase): Primarily for Scott and FD as the SD and IQR are both 0 in this case """ novar_dataset = np.ones(100) - novar_resultdict = {'fd': 1, 'scott': 1, 'rice': 10, 'sturges': 8, - 'doane': 1, 'sqrt': 10, 'auto': 8} + novar_resultdict = {'fd': 1, 'scott': 1, 'rice': 1, 'sturges': 1, + 'doane': 1, 'sqrt': 1, 'auto': 1} for estimator, numbins in novar_resultdict.items(): a, b = np.histogram(novar_dataset, estimator) @@ -1510,14 +1510,14 @@ class TestHistogramOptimBinNums(TestCase): the shouldn't change. """ # some basic sanity checking, with some fixed data. Checking for the correct number of bins - basic_test = {50: {'fd': 4, 'scott': 4, 'rice': 8, 'sturges': 7, 'auto': 7}, - 500: {'fd': 8, 'scott': 8, 'rice': 16, 'sturges': 10, 'auto': 10}, - 5000: {'fd': 17, 'scott': 17, 'rice': 35, 'sturges': 14, 'auto': 17}} + basic_test = {50: {'fd': 8, 'scott': 8, 'rice': 15, 'sturges': 14, 'auto': 14}, + 500: {'fd': 15, 'scott': 16, 'rice': 32, 'sturges': 20, 'auto': 20}, + 5000: {'fd': 33, 'scott': 33, 'rice': 69, 'sturges': 28, 'auto': 33}} for testlen, expectedResults in basic_test.items(): - # create some sort of non uniform data to test with (2 peak uniform mixture) - x1 = np.linspace(-10, -1, testlen/5 * 2) - x2 = np.linspace(1, 10, testlen/5 * 3) + # create some sort of non uniform data to test with (3 peak uniform mixture) + x1 = np.linspace(-10, -1, testlen // 5 * 2) + x2 = np.linspace(1, 10, testlen // 5 * 3) x3 = np.linspace(-100, -50, testlen) x = np.hstack((x1, x2, x3)) for estimator, numbins in expectedResults.items(): |