summaryrefslogtreecommitdiff
path: root/numpy
diff options
context:
space:
mode:
authorCharles Harris <charlesr.harris@gmail.com>2016-03-16 10:35:03 -0600
committerCharles Harris <charlesr.harris@gmail.com>2016-03-16 10:35:03 -0600
commit858b5b201f82be0aa98210dc363f049939a15e31 (patch)
tree839c61a8bd82416f4b943486e16b139bec8b533a /numpy
parent1429c606643d1ad305e710c4a31cb6f398d04c53 (diff)
parent8869c1ace77affefff75c8a772edb2983b68a015 (diff)
downloadnumpy-858b5b201f82be0aa98210dc363f049939a15e31.tar.gz
Merge pull request #7416 from madphysicist/hist-range
BUG: Incorrect handling of range in `histogram` with automatic bins.
Diffstat (limited to 'numpy')
-rw-r--r--numpy/lib/function_base.py185
-rw-r--r--numpy/lib/tests/test_function_base.py24
2 files changed, 114 insertions, 95 deletions
diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py
index 648eb5019..31aafda15 100644
--- a/numpy/lib/function_base.py
+++ b/numpy/lib/function_base.py
@@ -152,18 +152,20 @@ def _hist_bin_sqrt(x):
"""
Square root histogram bin estimator.
- Used by many programs for its simplicity.
+ Bin width is inversely proportional to the data size. Used by many
+ programs for its simplicity.
Parameters
----------
x : array_like
- Input data that is to be histogrammed.
+ Input data that is to be histogrammed, trimmed to range. May not
+ be empty.
Returns
-------
- n : An estimate of the optimal bin count for the given data.
+ h : An estimate of the optimal bin width for the given data.
"""
- return int(np.ceil(np.sqrt(x.size)))
+ return x.ptp() / np.sqrt(x.size)
def _hist_bin_sturges(x):
@@ -178,13 +180,14 @@ def _hist_bin_sturges(x):
Parameters
----------
x : array_like
- Input data that is to be histogrammed.
+ Input data that is to be histogrammed, trimmed to range. May not
+ be empty.
Returns
-------
- n : An estimate of the optimal bin count for the given data.
+ h : An estimate of the optimal bin width for the given data.
"""
- return int(np.ceil(np.log2(x.size))) + 1
+ return x.ptp() / np.ceil(np.log2(x.size) + 1.0)
def _hist_bin_rice(x):
@@ -200,13 +203,14 @@ def _hist_bin_rice(x):
Parameters
----------
x : array_like
- Input data that is to be histogrammed.
+ Input data that is to be histogrammed, trimmed to range. May not
+ be empty.
Returns
-------
- n : An estimate of the optimal bin count for the given data.
+ h : An estimate of the optimal bin width for the given data.
"""
- return int(np.ceil(2 * x.size ** (1.0 / 3)))
+ return x.ptp() / (2.0 * x.size ** (1.0 / 3))
def _hist_bin_scott(x):
@@ -220,16 +224,14 @@ def _hist_bin_scott(x):
Parameters
----------
x : array_like
- Input data that is to be histogrammed.
+ Input data that is to be histogrammed, trimmed to range. May not
+ be empty.
Returns
-------
- n : An estimate of the optimal bin count for the given data.
+ h : An estimate of the optimal bin width for the given data.
"""
- h = (24 * np.pi**0.5 / x.size)**(1.0 / 3) * np.std(x)
- if h > 0:
- return int(np.ceil(x.ptp() / h))
- return 1
+ return (24.0 * np.pi**0.5 / x.size)**(1.0 / 3.0) * np.std(x)
def _hist_bin_doane(x):
@@ -243,16 +245,17 @@ def _hist_bin_doane(x):
Parameters
----------
x : array_like
- Input data that is to be histogrammed.
+ Input data that is to be histogrammed, trimmed to range. May not
+ be empty.
Returns
-------
- n : An estimate of the optimal bin count for the given data.
+ h : An estimate of the optimal bin width for the given data.
"""
if x.size > 2:
sg1 = np.sqrt(6.0 * (x.size - 2) / ((x.size + 1.0) * (x.size + 3)))
sigma = np.std(x)
- if sigma > 0:
+ if sigma > 0.0:
# These three operations add up to
# g1 = np.mean(((x - np.mean(x)) / sigma)**3)
# but use only one temp array instead of three
@@ -260,21 +263,21 @@ def _hist_bin_doane(x):
np.true_divide(temp, sigma, temp)
np.power(temp, 3, temp)
g1 = np.mean(temp)
- return int(np.ceil(1.0 + np.log2(x.size) +
- np.log2(1.0 + np.absolute(g1) / sg1)))
- return 1
+ return x.ptp() / (1.0 + np.log2(x.size) +
+ np.log2(1.0 + np.absolute(g1) / sg1))
+ return 0.0
def _hist_bin_fd(x):
"""
The Freedman-Diaconis histogram bin estimator.
- The Freedman-Diaconis rule uses interquartile range (IQR)
- binwidth. It is considered a variation of the Scott rule with more
- robustness as the IQR is less affected by outliers than the standard
- deviation. However, the IQR depends on fewer points than the
- standard deviation, so it is less accurate, especially for long
- tailed distributions.
+ The Freedman-Diaconis rule uses interquartile range (IQR) to
+ estimate binwidth. It is considered a variation of the Scott rule
+ with more robustness as the IQR is less affected by outliers than
+ the standard deviation. However, the IQR depends on fewer points
+ than the standard deviation, so it is less accurate, especially for
+ long tailed distributions.
If the IQR is 0, this function returns 1 for the number of bins.
Binwidth is inversely proportional to the cube root of data size
@@ -283,46 +286,44 @@ def _hist_bin_fd(x):
Parameters
----------
x : array_like
- Input data that is to be histogrammed.
+ Input data that is to be histogrammed, trimmed to range. May not
+ be empty.
Returns
-------
- n : An estimate of the optimal bin count for the given data.
+ h : An estimate of the optimal bin width for the given data.
"""
iqr = np.subtract(*np.percentile(x, [75, 25]))
-
- if iqr > 0:
- h = (2 * iqr * x.size ** (-1.0 / 3))
- return int(np.ceil(x.ptp() / h))
-
- # If iqr is 0, default number of bins is 1
- return 1
+ return 2.0 * iqr * x.size ** (-1.0 / 3.0)
def _hist_bin_auto(x):
"""
- Histogram bin estimator that uses the maximum of the
+ Histogram bin estimator that uses the minimum width of the
Freedman-Diaconis and Sturges estimators.
- The FD estimator is usually the most robust method, but it tends to
- be too small for small `x`. The Sturges estimator is quite good for
- small (<1000) datasets and is the default in the R language. This
- method gives good off the shelf behaviour.
+ The FD estimator is usually the most robust method, but its width
+ estimate tends to be too large for small `x`. The Sturges estimator
+ is quite good for small (<1000) datasets and is the default in the R
+ language. This method gives good off the shelf behaviour.
Parameters
----------
x : array_like
- Input data that is to be histogrammed.
+ Input data that is to be histogrammed, trimmed to range. May not
+ be empty.
Returns
-------
- n : An estimate of the optimal bin count for the given data.
+ h : An estimate of the optimal bin width for the given data.
See Also
--------
_hist_bin_fd, _hist_bin_sturges
"""
- return max(_hist_bin_fd(x), _hist_bin_sturges(x))
+ # There is no need to check for zero here. If ptp is, so is IQR and
+ # vice versa. Either both are zero or neither one is.
+ return min(_hist_bin_fd(x), _hist_bin_sturges(x))
# Private dict initialized at module load time
@@ -353,8 +354,12 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
.. versionadded:: 1.11.0
If `bins` is a string from the list below, `histogram` will use
- the method chosen to calculate the optimal number of bins (see
- `Notes` for more detail on the estimators). For visualisation,
+ the method chosen to calculate the optimal bin width and
+ consequently the number of bins (see `Notes` for more detail on
+ the estimators) from the data that falls within the requested
+ range. While the bin width will be optimal for the actual data
+ in the range, the number of bins will be computed to fill the
+ entire range, including the empty portions. For visualisation,
using the 'auto' option is suggested. Weighted data is not
supported for automated bin size selection.
@@ -390,7 +395,11 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
range : (float, float), optional
The lower and upper range of the bins. If not provided, range
is simply ``(a.min(), a.max())``. Values outside the range are
- ignored.
+ ignored. The first element of the range must be less than or
+ equal to the second. `range` affects the automatic bin
+ computation as well. While bin width is computed to be optimal
+ based on the actual data within `range`, the bin count will fill
+ the entire range including portions containing no data.
normed : bool, optional
This keyword is deprecated in Numpy 1.6 due to confusing/buggy
behavior. It will be removed in Numpy 2.0. Use the ``density``
@@ -442,13 +451,16 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
.. versionadded:: 1.11.0
- The methods to estimate the optimal number of bins are well found in
- literature, and are inspired by the choices R provides for histogram
- visualisation. Note that having the number of bins proportional to
- :math:`n^{1/3}` is asymptotically optimal, which is why it appears
- in most estimators. These are simply plug-in methods that give good
- starting points for number of bins. In the equations below,
- :math:`h` is the binwidth and :math:`n_h` is the number of bins.
+ The methods to estimate the optimal number of bins are well founded
+ in literature, and are inspired by the choices R provides for
+ histogram visualisation. Note that having the number of bins
+ proportional to :math:`n^{1/3}` is asymptotically optimal, which is
+ why it appears in most estimators. These are simply plug-in methods
+ that give good starting points for number of bins. In the equations
+ below, :math:`h` is the binwidth and :math:`n_h` is the number of
+ bins. All estimators that compute bin counts are recast to bin width
+ using the `ptp` of the data. The final bin count is obtained from
+ ``np.round(np.ceil(range / h))``.
'Auto' (minimum bin width of the 'Sturges' and 'FD' estimators)
A compromise to get a good value. For small datasets the Sturges
@@ -476,14 +488,14 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
estimator in the absence of outliers.
'Rice'
- .. math:: n_h = \left\lceil 2n^{1/3} \right\rceil
+ .. math:: n_h = 2n^{1/3}
The number of bins is only proportional to cube root of
``a.size``. It tends to overestimate the number of bins and it
does not take into account data variability.
'Sturges'
- .. math:: n_h = \left\lceil \log _{2}n+1 \right\rceil
+ .. math:: n_h = \log _{2}n+1
The number of bins is the base 2 log of ``a.size``. This
estimator assumes normality of data and is too conservative for
@@ -491,19 +503,19 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
``hist`` method.
'Doane'
- .. math:: n_h = \left\lceil 1 + \log_{2}(n) +
- \log_{2}(1 + \frac{|g_1|}{\sigma_{g_1})}
- \right\rceil
+ .. math:: n_h = 1 + \log_{2}(n) +
+ \log_{2}(1 + \frac{|g_1|}{\sigma_{g_1})}
g_1 = mean[(\frac{x - \mu}{\sigma})^3]
\sigma_{g_1} = \sqrt{\frac{6(n - 2)}{(n + 1)(n + 3)}}
An improved version of Sturges' formula that produces better
- estimates for non-normal datasets.
+ estimates for non-normal datasets. This estimator attempts to
+ account for the skew of the data.
'Sqrt'
- .. math:: n_h = \left\lceil \sqrt n \right\rceil
+ .. math:: n_h = \sqrt n
The simplest and fastest estimator. Only takes into account the
data size.
@@ -548,20 +560,30 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
weights = weights.ravel()
a = a.ravel()
- if (range is not None):
- mn, mx = range
- if (mn > mx):
- raise ValueError(
- 'max must be larger than min in range parameter.')
- if not np.all(np.isfinite([mn, mx])):
- raise ValueError(
- 'range parameter must be finite.')
+ # Do not modify the original value of range so we can check for `None`
+ if range is None:
+ if a.size == 0:
+ # handle empty arrays. Can't determine range, so use 0-1.
+ mn, mx = 0.0, 1.0
+ else:
+ mn, mx = a.min() + 0.0, a.max() + 0.0
+ else:
+ mn, mx = [mi + 0.0 for mi in range]
+ if mn > mx:
+ raise ValueError(
+ 'max must be larger than min in range parameter.')
+ if not np.all(np.isfinite([mn, mx])):
+ raise ValueError(
+ 'range parameter must be finite.')
+ if mn == mx:
+ mn -= 0.5
+ mx += 0.5
if isinstance(bins, basestring):
# if `bins` is a string for an automatic method,
# this will replace it with the number of bins calculated
if bins not in _hist_bin_selectors:
- raise ValueError("{0} not a valid estimator for `bins`".format(bins))
+ raise ValueError("{0} not a valid estimator for bins".format(bins))
if weights is not None:
raise TypeError("Automated estimation of the number of "
"bins is not supported for weighted data")
@@ -569,15 +591,22 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
b = a
# Update the reference if the range needs truncation
if range is not None:
- mn, mx = range
keep = (a >= mn)
keep &= (a <= mx)
if not np.logical_and.reduce(keep):
b = a[keep]
+
if b.size == 0:
bins = 1
else:
- bins = _hist_bin_selectors[bins](b)
+ # Do not call selectors on empty arrays
+ width = _hist_bin_selectors[bins](b)
+ if width:
+ bins = int(np.ceil((mx - mn) / width))
+ else:
+ # Width can be zero for some estimators, e.g. FD when
+ # the IQR of the data is zero.
+ bins = 1
# Histogram is an integer or a float array depending on the weights.
if weights is None:
@@ -593,16 +622,6 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
if np.isscalar(bins) and bins < 1:
raise ValueError(
'`bins` should be a positive integer.')
- if range is None:
- if a.size == 0:
- # handle empty arrays. Can't determine range, so use 0-1.
- range = (0, 1)
- else:
- range = (a.min(), a.max())
- mn, mx = [mi + 0.0 for mi in range]
- if mn == mx:
- mn -= 0.5
- mx += 0.5
# At this point, if the weights are not integer, floating point, or
# complex, we have to use the slow algorithm.
if weights is not None and not (np.can_cast(weights.dtype, np.double) or
diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py
index 945992fc0..20c786ad1 100644
--- a/numpy/lib/tests/test_function_base.py
+++ b/numpy/lib/tests/test_function_base.py
@@ -1432,9 +1432,9 @@ class TestHistogramOptimBinNums(TestCase):
for testlen, expectedResults in basic_test.items():
# Create some sort of non uniform data to test with
# (2 peak uniform mixture)
- x1 = np.linspace(-10, -1, testlen/5 * 2)
- x2 = np.linspace(1,10, testlen/5 * 3)
- x = np.hstack((x1, x2))
+ x1 = np.linspace(-10, -1, testlen // 5 * 2)
+ x2 = np.linspace(1, 10, testlen // 5 * 3)
+ x = np.concatenate((x1, x2))
for estimator, numbins in expectedResults.items():
a, b = np.histogram(x, estimator)
assert_equal(len(a), numbins, err_msg="For the {0} estimator "
@@ -1446,7 +1446,7 @@ class TestHistogramOptimBinNums(TestCase):
adaptive methods, especially the FD method. All bin numbers have been
precalculated.
"""
- small_dat = {1: {'fd': 1, 'scott': 1, 'rice': 2, 'sturges': 1,
+ small_dat = {1: {'fd': 1, 'scott': 1, 'rice': 1, 'sturges': 1,
'doane': 1, 'sqrt': 1},
2: {'fd': 2, 'scott': 1, 'rice': 3, 'sturges': 2,
'doane': 1, 'sqrt': 2},
@@ -1474,8 +1474,8 @@ class TestHistogramOptimBinNums(TestCase):
Primarily for Scott and FD as the SD and IQR are both 0 in this case
"""
novar_dataset = np.ones(100)
- novar_resultdict = {'fd': 1, 'scott': 1, 'rice': 10, 'sturges': 8,
- 'doane': 1, 'sqrt': 10, 'auto': 8}
+ novar_resultdict = {'fd': 1, 'scott': 1, 'rice': 1, 'sturges': 1,
+ 'doane': 1, 'sqrt': 1, 'auto': 1}
for estimator, numbins in novar_resultdict.items():
a, b = np.histogram(novar_dataset, estimator)
@@ -1510,14 +1510,14 @@ class TestHistogramOptimBinNums(TestCase):
they shouldn't change.
"""
# some basic sanity checking, with some fixed data. Checking for the correct number of bins
- basic_test = {50: {'fd': 4, 'scott': 4, 'rice': 8, 'sturges': 7, 'auto': 7},
- 500: {'fd': 8, 'scott': 8, 'rice': 16, 'sturges': 10, 'auto': 10},
- 5000: {'fd': 17, 'scott': 17, 'rice': 35, 'sturges': 14, 'auto': 17}}
+ basic_test = {50: {'fd': 8, 'scott': 8, 'rice': 15, 'sturges': 14, 'auto': 14},
+ 500: {'fd': 15, 'scott': 16, 'rice': 32, 'sturges': 20, 'auto': 20},
+ 5000: {'fd': 33, 'scott': 33, 'rice': 69, 'sturges': 28, 'auto': 33}}
for testlen, expectedResults in basic_test.items():
- # create some sort of non uniform data to test with (2 peak uniform mixture)
- x1 = np.linspace(-10, -1, testlen/5 * 2)
- x2 = np.linspace(1, 10, testlen/5 * 3)
+ # create some sort of non uniform data to test with (3 peak uniform mixture)
+ x1 = np.linspace(-10, -1, testlen // 5 * 2)
+ x2 = np.linspace(1, 10, testlen // 5 * 3)
x3 = np.linspace(-100, -50, testlen)
x = np.hstack((x1, x2, x3))
for estimator, numbins in expectedResults.items():