author | Joseph Fox-Rabinovitz <joseph.r.fox-rabinovitz@nasa.gov> | 2016-02-01 16:29:48 -0500
committer | Joseph Fox-Rabinovitz <jfoxrabinovitz@gmail.com> | 2016-02-11 20:06:30 -0500
commit | b8b55614a3d3c2e3e2c653064719de6906c1be39 (patch)
tree | d7e56a9618ead489145eaf54047349d59cfec9b9 /numpy/lib/function_base.py
parent | 47b6c2b8bacb510cac62d490c159ec116080d1d0 (diff)
download | numpy-b8b55614a3d3c2e3e2c653064719de6906c1be39.tar.gz
Added 'doane' and 'sqrt' estimators to np.histogram in numpy.function_base
Diffstat (limited to 'numpy/lib/function_base.py')
-rw-r--r-- | numpy/lib/function_base.py | 246
1 file changed, 163 insertions(+), 83 deletions(-)
diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py
index b8e017eab..521694506 100644
--- a/numpy/lib/function_base.py
+++ b/numpy/lib/function_base.py
@@ -78,56 +78,100 @@ def iterable(y):
 def _hist_optim_numbins_estimator(a, estimator):
     """
-    A helper function to be called from histogram to deal with estimating optimal number of bins
+    A helper function to be called from ``histogram`` to deal with
+    estimating optimal number of bins.
+
+    A description of the estimators can be found at
+    https://en.wikipedia.org/wiki/Histogram#Number_of_bins_and_width
 
     estimator: str
-        If estimator is one of ['auto', 'fd', 'scott', 'rice', 'sturges'] this function
-        will choose the appropriate estimator and return it's estimate for the optimal
-        number of bins.
+        If ``estimator`` is one of ['auto', 'fd', 'scott', 'doane',
+        'rice', 'sturges', 'sqrt'], this function will choose the
+        appropriate estimation method and return the optimal number of
+        bins it calculates.
     """
     if a.size == 0:
         return 1
 
+    def sqrt(x):
+        """
+        Square Root Estimator
+
+        Used by many programs for its simplicity.
+        """
+        return np.ceil(np.sqrt(x.size))
+
     def sturges(x):
         """
         Sturges Estimator
-        A very simplistic estimator based on the assumption of normality of the data
-        Poor performance for non-normal data, especially obvious for large X.
-        Depends only on size of the data.
+
+        A very simplistic estimator based on the assumption of normality
+        of the data. Poor performance for non-normal data, especially
+        obvious for large ``x``. Depends only on size of the data.
         """
         return np.ceil(np.log2(x.size)) + 1
 
     def rice(x):
         """
         Rice Estimator
-        Another simple estimator, with no normality assumption.
-        It has better performance for large data, but tends to overestimate number of bins.
-        The number of bins is proportional to the cube root of data size (asymptotically optimal)
-        Depends only on size of the data
+
+        Another simple estimator, with no normality assumption. It has
+        better performance for large data, but tends to overestimate
+        number of bins. The number of bins is proportional to the cube
+        root of data size (asymptotically optimal). Depends only on size
+        of the data.
         """
         return np.ceil(2 * x.size ** (1.0 / 3))
 
     def scott(x):
         """
         Scott Estimator
-        The binwidth is proportional to the standard deviation of the data and
-        inversely proportional to the cube root of data size (asymptotically optimal)
+
+        The binwidth is proportional to the standard deviation of the
+        data and inversely proportional to the cube root of data size
+        (asymptotically optimal).
         """
-        h = 3.5 * x.std() * x.size ** (-1.0 / 3)
+        h = (24 * np.pi**0.5 / x.size)**(1.0 / 3) * np.std(x)
         if h > 0:
             return np.ceil(x.ptp() / h)
         return 1
 
+    def doane(x):
+        """
+        Doane's Estimator
+
+        Improved version of Sturges' formula which works better for
+        non-normal data. See
+        http://stats.stackexchange.com/questions/55134/doanes-formula-for-histogram-binning
+        """
+        if x.size > 2:
+            sg1 = np.sqrt(6.0 * (x.size - 2) / ((x.size + 1.0) * (x.size + 3)))
+            sigma = np.std(x)
+            if sigma > 0:
+                # These three operations add up to
+                # g1 = np.mean(((x - np.mean(x)) / sigma)**3)
+                # but use only one temp array instead of three
+                temp = x - np.mean(x)
+                np.true_divide(temp, sigma, temp)
+                np.power(temp, 3, temp)
+                g1 = np.mean(temp)
+                return np.ceil(1.0 + np.log2(x.size) +
+                               np.log2(1.0 + np.absolute(g1) / sg1))
+        return 1
+
     def fd(x):
         """
-        Freedman Diaconis rule using interquartile range (IQR) for binwidth
-        Considered a variation of the Scott rule with more robustness as the IQR
-        is less affected by outliers than the standard deviation. However the IQR depends on
-        fewer points than the sd so it is less accurate, especially for long tailed distributions.
+        Freedman Diaconis Estimator
 
-        If the IQR is 0, we return 1 for the number of bins.
-        Binwidth is inversely proportional to the cube root of data size (asymptotically optimal)
+        The interquartile range (IQR) is used for binwidth, making this
+        variation of the Scott rule more robust, as the IQR is less
+        affected by outliers than the standard deviation. However, the
+        IQR depends on fewer points than the standard deviation, so it
+        is less accurate, especially for long tailed distributions.
+
+        If the IQR is 0, we return 1 for the number of bins. Binwidth is
+        inversely proportional to the cube root of data size
+        (asymptotically optimal).
         """
 
         iqr = np.subtract(*np.percentile(x, [75, 25]))
@@ -140,14 +184,15 @@ def _hist_optim_numbins_estimator(a, estimator):
     def auto(x):
         """
-        The FD estimator is usually the most robust method, but it tends to be too small
-        for small X. The Sturges estimator is quite good for small (<1000) datasets and is
-        the default in R.
-        This method gives good off the shelf behaviour.
+        The FD estimator is usually the most robust method, but it tends
+        to be too small for small ``x``. The Sturges estimator is quite
+        good for small (<1000) datasets and is the default in R. This
+        method gives good off-the-shelf behaviour.
         """
         return max(fd(x), sturges(x))
 
-    optimal_numbins_methods = {'sturges': sturges, 'rice': rice, 'scott': scott,
+    optimal_numbins_methods = {'sqrt': sqrt, 'sturges': sturges,
+                               'rice': rice, 'scott': scott, 'doane': doane,
                                'fd': fd, 'auto': auto}
     try:
         estimator_func = optimal_numbins_methods[estimator.lower()]
@@ -169,34 +214,45 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
         Input data. The histogram is computed over the flattened array.
     bins : int or sequence of scalars or str, optional
         If `bins` is an int, it defines the number of equal-width
-        bins in the given range (10, by default). If `bins` is a sequence,
-        it defines the bin edges, including the rightmost edge, allowing
-        for non-uniform bin widths.
+        bins in the given range (10, by default). If `bins` is a
+        sequence, it defines the bin edges, including the rightmost
+        edge, allowing for non-uniform bin widths.
 
         .. versionadded:: 1.11.0
 
-        If `bins` is a string from the list below, `histogram` will use the method
-        chosen to calculate the optimal number of bins (see Notes for more detail
-        on the estimators). For visualisation, we suggest using the 'auto' option.
+        If `bins` is a string from the list below, `histogram` will use
+        the method chosen to calculate the optimal number of bins (see
+        Notes for more detail on the estimators). For visualisation, we
+        suggest using the 'auto' option.
 
         'auto'
-            Maximum of the 'sturges' and 'fd' estimators. Provides good all round performance
+            Maximum of the 'sturges' and 'fd' estimators. Provides good
+            all-round performance.
 
         'fd' (Freedman Diaconis Estimator)
-            Robust (resilient to outliers) estimator that takes into account data
-            variability and data size .
+            Robust (resilient to outliers) estimator that takes into
+            account data variability and data size.
+
+        'doane'
+            An improved version of Sturges' estimator that works better
+            with non-normal datasets.
 
         'scott'
            Less robust estimator that takes into account data
           variability and data size.
 
        'rice'
-            Estimator does not take variability into account, only data size.
-            Commonly overestimates number of bins required.
+            Estimator does not take variability into account, only data
+            size. Commonly overestimates number of bins required.
 
        'sturges'
-            R's default method, only accounts for data size. Only optimal for
-            gaussian data and underestimates number of bins for large non-gaussian datasets.
+            R's default method, only accounts for data size. Only
+            optimal for gaussian data and underestimates number of bins
+            for large non-gaussian datasets.
+
+        'sqrt'
+            Square root (of data size) estimator, used by Excel and
+            other programs for its speed and simplicity.
 
    range : (float, float), optional
        The lower and upper range of the bins.  If not provided, range
@@ -204,31 +260,33 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
        ignored.
    normed : bool, optional
        This keyword is deprecated in Numpy 1.6 due to confusing/buggy
-        behavior. It will be removed in Numpy 2.0. Use the density keyword
-        instead.
-        If False, the result will contain the number of samples
-        in each bin. If True, the result is the value of the
-        probability *density* function at the bin, normalized such that
-        the *integral* over the range is 1. Note that this latter behavior is
-        known to be buggy with unequal bin widths; use `density` instead.
+        behavior. It will be removed in Numpy 2.0. Use the ``density``
+        keyword instead. If ``False``, the result will contain the
+        number of samples in each bin. If ``True``, the result is the
+        value of the probability *density* function at the bin,
+        normalized such that the *integral* over the range is 1. Note
+        that this latter behavior is known to be buggy with unequal bin
+        widths; use ``density`` instead.
    weights : array_like, optional
-        An array of weights, of the same shape as `a`. Each value in `a`
-        only contributes its associated weight towards the bin count
-        (instead of 1). If `normed` is True, the weights are normalized,
-        so that the integral of the density over the range remains 1
+        An array of weights, of the same shape as `a`. Each value in
+        `a` only contributes its associated weight towards the bin count
+        (instead of 1). If `density` is True, the weights are
+        normalized, so that the integral of the density over the range
+        remains 1.
    density : bool, optional
-        If False, the result will contain the number of samples
-        in each bin. If True, the result is the value of the
+        If ``False``, the result will contain the number of samples in
+        each bin. If ``True``, the result is the value of the
        probability *density* function at the bin, normalized such that
        the *integral* over the range is 1. Note that the sum of the
        histogram values will not be equal to 1 unless bins of unity
        width are chosen; it is not a probability *mass* function.
-        Overrides the `normed` keyword if given.
+
+        Overrides the ``normed`` keyword if given.
 
    Returns
    -------
    hist : array
-        The values of the histogram. See `normed` and `weights` for a
+        The values of the histogram. See `density` and `weights` for a
        description of the possible semantics.
    bin_edges : array of dtype float
        Return the bin edges ``(length(hist)+1)``.
@@ -240,56 +298,77 @@
    Notes
    -----
-    All but the last (righthand-most) bin is half-open. In other words, if
-    `bins` is::
+    All but the last (righthand-most) bin is half-open. In other words,
+    if `bins` is::
 
      [1, 2, 3, 4]
 
-    then the first bin is ``[1, 2)`` (including 1, but excluding 2) and the
-    second ``[2, 3)``.  The last bin, however, is ``[3, 4]``, which *includes*
-    4.
+    then the first bin is ``[1, 2)`` (including 1, but excluding 2) and
+    the second ``[2, 3)``.  The last bin, however, is ``[3, 4]``, which
+    *includes* 4.
 
    .. versionadded:: 1.11.0
 
-    The methods to estimate the optimal number of bins are well found in literature,
-    and are inspired by the choices R provides for histogram visualisation.
-    Note that having the number of bins proportional to :math:`n^{1/3}` is asymptotically optimal,
-    which is why it appears in most estimators.
-    These are simply plug-in methods that give good starting points for number of bins.
-    In the equations below, :math:`h` is the binwidth and :math:`n_h` is the number of bins
+    The methods to estimate the optimal number of bins are well founded
+    in the literature, and are inspired by the choices R provides for
+    histogram visualisation. Note that having the number of bins
+    proportional to :math:`n^{1/3}` is asymptotically optimal, which is
+    why it appears in most estimators. These are simply plug-in methods
+    that give good starting points for number of bins. In the equations
+    below, :math:`h` is the binwidth and :math:`n_h` is the number of
+    bins.
 
    'Auto' (maximum of the 'Sturges' and 'FD' estimators)
-        A compromise to get a good value. For small datasets the sturges
-        value will usually be chosen, while larger datasets will usually default to FD.
-        Avoids the overly conservative behaviour of FD and Sturges for small and
-        large datasets respectively. Switchover point is usually x.size~1000.
+        A compromise to get a good value. For small datasets the Sturges
+        value will usually be chosen, while larger datasets will usually
+        default to FD. Avoids the overly conservative behaviour of FD
+        and Sturges for small and large datasets respectively.
+        Switchover point usually happens when ``x.size`` is around 1000.
 
    'FD' (Freedman Diaconis Estimator)
        .. math:: h = 2 \\frac{IQR}{n^{1/3}}
        The binwidth is proportional to the interquartile range (IQR)
        and inversely proportional to cube root of a.size. Can be too
-        conservative for small datasets, but is quite good
-        for large datasets. The IQR is very robust to outliers.
+        conservative for small datasets, but is quite good for large
+        datasets. The IQR is very robust to outliers.
 
    'Scott'
-        .. math:: h = \\frac{3.5\\sigma}{n^{1/3}}
-        The binwidth is proportional to the standard deviation (sd) of the data
-        and inversely proportional to cube root of a.size. Can be too
-        conservative for small datasets, but is quite good
-        for large datasets. The sd is not very robust to outliers. Values
-        are very similar to the Freedman Diaconis Estimator in the absence of outliers.
+        .. math:: h = \\sigma \\sqrt[3]{\\frac{24\\sqrt{\\pi}}{n}}
+        The binwidth is proportional to the standard deviation of the
+        data and inversely proportional to cube root of ``x.size``. Can
+        be too conservative for small datasets, but is quite good for
+        large datasets. The standard deviation is not very robust to
+        outliers. Values are very similar to the Freedman-Diaconis
+        estimator in the absence of outliers.
 
    'Rice'
        .. math:: n_h = \\left\\lceil 2n^{1/3} \\right\\rceil
-        The number of bins is only proportional to cube root of a.size.
-        It tends to overestimate the number of bins
-        and it does not take into account data variability.
+        The number of bins is only proportional to cube root of
+        ``a.size``. It tends to overestimate the number of bins and it
+        does not take into account data variability.
 
    'Sturges'
        .. math:: n_h = \\left\\lceil \\log _{2}n+1 \\right\\rceil
-        The number of bins is the base2 log of a.size.
-        This estimator assumes normality of data and is too conservative for larger,
-        non-normal datasets. This is the default method in R's `hist` method.
+        The number of bins is the base 2 log of ``a.size``. This
+        estimator assumes normality of data and is too conservative for
+        larger, non-normal datasets. This is the default method in R's
+        ``hist`` method.
+
+    'Doane'
+        .. math:: n_h = \\left\\lceil 1 + \\log_{2}(n) +
+                        \\log_{2}\\left(1 + \\frac{|g_1|}{\\sigma_{g_1}}\\right)
+                  \\right\\rceil
+
+            g_1 = mean[(\\frac{x - \\mu}{\\sigma})^3]
+
+            \\sigma_{g_1} = \\sqrt{\\frac{6(n - 2)}{(n + 1)(n + 3)}}
+
+        An improved version of Sturges' formula that produces better
+        estimates for non-normal datasets.
+
+    'Sqrt'
+        .. math:: n_h = \\left\\lceil \\sqrt{n} \\right\\rceil
+        The simplest and fastest estimator. Only takes into account the
+        data size.
 
    Examples
    --------
@@ -311,7 +390,8 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
 
    .. versionadded:: 1.11.0
 
-    Automated Bin Selection Methods example, using 2 peak random data with 2000 points
+    Automated Bin Selection Methods example, using 2 peak random data
+    with 2000 points:
 
    >>> import matplotlib.pyplot as plt
    >>> rng = np.random.RandomState(10)  # deterministic random data
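
For a quick look at the API surface this patch adds, here is a minimal sketch (not part of the commit) that exercises every estimator string ``np.histogram`` accepts once the patch is applied, including the new 'sqrt' and 'doane' options. It assumes NumPy 1.11 or later, where string-valued `bins` is available.

```python
import numpy as np

# Two-peak data, in the same spirit as the docstring example above
rng = np.random.RandomState(10)
data = np.concatenate([rng.normal(0, 1, 1000),
                       rng.normal(5, 1, 1000)])

# Each string selects a different bin-count rule; the computed bin
# edges come back alongside the counts.
for estimator in ['sqrt', 'sturges', 'rice', 'scott', 'doane', 'fd', 'auto']:
    hist, edges = np.histogram(data, bins=estimator)
    print('%-8s -> %d bins' % (estimator, len(edges) - 1))
```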
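The ``doane`` helper computes the sample skewness ``g1`` with three in-place ufunc calls rather than the direct expression, trading a little readability for fewer temporary arrays, as its comment notes. A standalone sketch (the function names here are illustrative, not from the patch) confirming the two forms agree:

```python
import numpy as np

def skewness_direct(x):
    # Straightforward form: allocates a fresh temporary per operation
    return np.mean(((x - np.mean(x)) / np.std(x)) ** 3)

def skewness_inplace(x):
    # The patch's form: one temporary, reused via ufunc out arguments
    temp = x - np.mean(x)
    np.true_divide(temp, np.std(x), temp)
    np.power(temp, 3, temp)
    return np.mean(temp)

x = np.random.RandomState(0).gamma(2.0, size=10000)  # skewed sample
assert np.allclose(skewness_direct(x), skewness_inplace(x))
```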
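On the Scott change: the patch replaces the rounded constant 3.5 with the exact normal-reference value (24√π)^(1/3) ≈ 3.4908, so the new h = σ(24√π/n)^(1/3) is the same rule as the old h = 3.5σn^(-1/3) with the rounding removed. A one-line check:

```python
import numpy as np

# Exact Scott constant vs. the 3.5 approximation used before this patch
print((24 * np.sqrt(np.pi)) ** (1.0 / 3))  # -> 3.4908...
```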