diff options
author | Varun Nayyar <nayyarv@gmail.com> | 2015-09-22 19:38:34 +1000 |
---|---|---|
committer | Joseph Fox-Rabinovitz <jfoxrabinovitz@gmail.com> | 2016-02-13 23:15:28 -0500 |
commit | 62bb0cb9681f638b56dc29b970228cb36c2104b6 (patch) | |
tree | 10dd5fd6834cbeb16638efa90ee0e8436fec5117 | |
parent | 26af0ce08f2c6660d8623446d79cb0569f20c2f6 (diff) | |
download | numpy-62bb0cb9681f638b56dc29b970228cb36c2104b6.tar.gz |
ENH: Adding support to the range keyword for estimation of the optimal number of bins and associated tests
-rw-r--r-- | numpy/lib/function_base.py | 26 | ||||
-rw-r--r-- | numpy/lib/tests/test_function_base.py | 31 |
2 files changed, 54 insertions, 3 deletions
diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py index 06d1ee4a7..66213c5e0 100644 --- a/numpy/lib/function_base.py +++ b/numpy/lib/function_base.py @@ -76,7 +76,7 @@ def iterable(y): return True -def _hist_optim_numbins_estimator(a, estimator): +def _hist_optim_numbins_estimator(a, estimator, data_range=None, data_weights=None): """ A helper function to be called from ``histogram`` to deal with estimating optimal number of bins. @@ -84,15 +84,34 @@ def _hist_optim_numbins_estimator(a, estimator): A description of the estimators can be found at https://en.wikipedia.org/wiki/Histogram#Number_of_bins_and_width + Parameters + ---------- + a : array_like + The data with which to estimate the number of bins estimator: str If ``estimator`` is one of ['auto', 'fd', 'scott', 'doane', 'rice', 'sturges', 'sqrt'], this function will choose the appropriate estimation method and return the optimal number of bins it calculates. + data_range: tuple (min, max) + The range that the data to be binned should be restricted to. + data_weights: + weights are not supported, so this field must be empty or None. """ if a.size == 0: return 1 + if data_weights is not None: + raise TypeError("Automated estimation of the number of " + "bins is not supported for weighted data") + + if data_range is not None: + mn, mx = data_range + keep = (a >= mn) + keep &= (a <= mx) + if not np.logical_and.reduce(keep): + a = a[keep] + def sqrt(x): """ Square Root Estimator @@ -223,7 +242,8 @@ def histogram(a, bins=10, range=None, normed=False, weights=None, If `bins` is a string from the list below, `histogram` will use the method chosen to calculate the optimal number of bins (see Notes for more detail on the estimators). For visualisation, we - suggest using the 'auto' option. + suggest using the 'auto' option. Weighted data is not supported + for automated bin size selection. 'auto' Maximum of the 'sturges' and 'fd' estimators. 
Provides good @@ -426,7 +446,7 @@ def histogram(a, bins=10, range=None, normed=False, weights=None, if isinstance(bins, basestring): - bins = _hist_optim_numbins_estimator(a, bins) + bins = _hist_optim_numbins_estimator(a, bins, range, weights) # if `bins` is a string for an automatic method, # this will replace it with the number of bins calculated diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py index 00d9f36c8..cd126fe71 100644 --- a/numpy/lib/tests/test_function_base.py +++ b/numpy/lib/tests/test_function_base.py @@ -1381,6 +1381,37 @@ class TestHistogramOptimBinNums(TestCase): a, b = np.histogram(outlier_dataset, estimator) assert_equal(len(a), numbins) + def test_simple_range(self): + """ + Straightforward testing with a mixture of linspace data (for + consistency). Adding in a 3rd mixture that will then be + completely ignored. All test values have been precomputed and + they shouldn't change. + """ + # some basic sanity checking, with some fixed data. 
Checking for the correct number of bins + basic_test = {50: {'fd': 4, 'scott': 4, 'rice': 8, 'sturges': 7, 'auto': 7}, + 500: {'fd': 8, 'scott': 8, 'rice': 16, 'sturges': 10, 'auto': 10}, + 5000: {'fd': 17, 'scott': 17, 'rice': 35, 'sturges': 14, 'auto': 17}} + + for testlen, expectedResults in basic_test.items(): + # create some sort of non uniform data to test with (2 peak uniform mixture) + x1 = np.linspace(-10, -1, testlen/5 * 2) + x2 = np.linspace(1, 10, testlen/5 * 3) + x3 = np.linspace(-100, -50, testlen) + x = np.hstack((x1, x2, x3)) + for estimator, numbins in expectedResults.items(): + a, b = np.histogram(x, estimator, range = (-20, 20)) + msg = "For the {0} estimator with datasize of {1}".format(estimator, testlen) + assert_equal(len(a), numbins, err_msg=msg) + + def test_simple_weighted(self): + """ + Check that weighted data raises a TypeError + """ + estimator_list = ['fd', 'scott', 'rice', 'sturges', 'auto'] + for estimator in estimator_list: + assert_raises(TypeError, histogram, [1, 2, 3], estimator, weights=[1, 2, 3]) + class TestHistogramdd(TestCase): |