ENH: Add support for the range keyword when estimating the optimal number of bins, plus associated tests

author: Varun Nayyar <nayyarv@gmail.com> 2015-09-22 19:38:34 +1000
committer: Joseph Fox-Rabinovitz <jfoxrabinovitz@gmail.com> 2016-02-13 23:15:28 -0500
commit: 62bb0cb9681f638b56dc29b970228cb36c2104b6 (patch)
tree: 10dd5fd6834cbeb16638efa90ee0e8436fec5117
parent: 26af0ce08f2c6660d8623446d79cb0569f20c2f6 (diff)
download: numpy-62bb0cb9681f638b56dc29b970228cb36c2104b6.tar.gz
2 files changed, 54 insertions, 3 deletions
diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py
index 06d1ee4a7..66213c5e0 100644
--- a/numpy/lib/function_base.py
+++ b/numpy/lib/function_base.py
@@ -76,7 +76,7 @@ def iterable(y):
     return True
 
 
-def _hist_optim_numbins_estimator(a, estimator):
+def _hist_optim_numbins_estimator(a, estimator, data_range=None, data_weights=None):
     """
     A helper function to be called from ``histogram`` to deal with
     estimating optimal number of bins.
@@ -84,15 +84,34 @@ def _hist_optim_numbins_estimator(a, estimator):
     A description of the estimators can be found at
     https://en.wikipedia.org/wiki/Histogram#Number_of_bins_and_width
 
+    Parameters
+    ----------
+    a : array_like
+        The data with which to estimate the number of bins
     estimator: str
         If ``estimator`` is one of ['auto', 'fd', 'scott', 'doane',
         'rice', 'sturges', 'sqrt'], this function will choose the
         appropriate estimation method and return the optimal number of
         bins it calculates.
+    data_range: tuple (min, max)
+        The range that the data to be binned should be restricted to.
+    data_weights:
+        weights are not supported, so this field must be empty or None.
     """
     if a.size == 0:
         return 1
 
+    if data_weights is not None:
+        raise TypeError("Automated estimation of the number of "
+                        "bins is not supported for weighted data")
+
+    if data_range is not None:
+        mn, mx = data_range
+        keep = (a >= mn)
+        keep &= (a <= mx)
+        if not np.logical_and.reduce(keep):
+            a = a[keep]
+
     def sqrt(x):
         """
         Square Root Estimator
@@ -223,7 +242,8 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
         If `bins` is a string from the list below, `histogram` will use
         the method chosen to calculate the optimal number of bins (see
         Notes for more detail on the estimators). For visualisation, we
-        suggest using the 'auto' option.
+        suggest using the 'auto' option. Weighted data is not supported
+        for automated bin size selection.
 
         'auto'
             Maximum of the 'sturges' and 'fd' estimators. Provides good
@@ -426,7 +446,7 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
 
 
     if isinstance(bins, basestring):
-        bins = _hist_optim_numbins_estimator(a, bins)
+        bins = _hist_optim_numbins_estimator(a, bins, range, weights)
         # if `bins` is a string for an automatic method,
         # this will replace it with the number of bins calculated
 
diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py
index 00d9f36c8..cd126fe71 100644
--- a/numpy/lib/tests/test_function_base.py
+++ b/numpy/lib/tests/test_function_base.py
@@ -1381,6 +1381,37 @@ class TestHistogramOptimBinNums(TestCase):
             a, b = np.histogram(outlier_dataset, estimator)
             assert_equal(len(a), numbins)
 
+    def test_simple_range(self):
+        """
+        Check bin-count estimation when ``range`` restricts the data.
+        Two linspace mixtures fall inside the range and a third lies
+        entirely outside it, so it must be ignored. Expected values
+        are precomputed and they shouldn't change.
+        """
+        # expected number of bins per estimator, keyed by `testlen`
+        basic_test = {50:   {'fd': 4,  'scott': 4,  'rice': 8,  'sturges': 7,  'auto': 7},
+                      500:  {'fd': 8,  'scott': 8,  'rice': 16, 'sturges': 10, 'auto': 10},
+                      5000: {'fd': 17, 'scott': 17, 'rice': 35, 'sturges': 14, 'auto': 17}}
+
+        for testlen, expectedResults in basic_test.items():
+            # `//` keeps the sample counts integral: linspace needs an integer `num`
+            x1 = np.linspace(-10, -1, testlen // 5 * 2)
+            x2 = np.linspace(1, 10, testlen // 5 * 3)
+            x3 = np.linspace(-100, -50, testlen)  # entirely outside the range below
+            x = np.hstack((x1, x2, x3))
+            for estimator, numbins in expectedResults.items():
+                a, b = np.histogram(x, estimator, range=(-20, 20))
+                msg = "For the {0} estimator with datasize of {1}".format(estimator, testlen)
+                assert_equal(len(a), numbins, err_msg=msg)
+
+    def test_simple_weighted(self):
+        """
+        Automatic bin estimation must reject weighted data with a TypeError
+        """
+        weights = [1, 2, 3]
+        for estimator in ('fd', 'scott', 'rice', 'sturges', 'auto'):
+            assert_raises(TypeError, histogram, [1, 2, 3], estimator, weights=weights)
+
 
 class TestHistogramdd(TestCase):
author	Varun Nayyar <nayyarv@gmail.com>	2015-09-22 19:38:34 +1000
committer	Joseph Fox-Rabinovitz <jfoxrabinovitz@gmail.com>	2016-02-13 23:15:28 -0500
commit	62bb0cb9681f638b56dc29b970228cb36c2104b6 (patch)
tree	10dd5fd6834cbeb16638efa90ee0e8436fec5117
parent	26af0ce08f2c6660d8623446d79cb0569f20c2f6 (diff)
download	numpy-62bb0cb9681f638b56dc29b970228cb36c2104b6.tar.gz