summaryrefslogtreecommitdiff
path: root/numpy/lib
diff options
context:
space:
mode:
authorJoseph Fox-Rabinovitz <jfoxrabinovitz@gmail.com>2016-03-14 23:06:46 -0400
committerJoseph Fox-Rabinovitz <jfoxrabinovitz@gmail.com>2016-03-16 05:49:37 -0400
commit8869c1ace77affefff75c8a772edb2983b68a015 (patch)
tree839c61a8bd82416f4b943486e16b139bec8b533a /numpy/lib
parent127eb9e7a4fb79668e62d1a50cf428fb7e7bf18e (diff)
downloadnumpy-8869c1ace77affefff75c8a772edb2983b68a015.tar.gz
DOC: Updated documentation to reflect changes to bin estimators.
Described ad nauseam the relationship between `range` parameter and bin estimation. Updated formulas for estimators now that they are returning bin widths.
Diffstat (limited to 'numpy/lib')
-rw-r--r--numpy/lib/function_base.py66
1 files changed, 40 insertions, 26 deletions
diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py
index 4a3aeba7e..31aafda15 100644
--- a/numpy/lib/function_base.py
+++ b/numpy/lib/function_base.py
@@ -152,7 +152,8 @@ def _hist_bin_sqrt(x):
"""
Square root histogram bin estimator.
- Used by many programs for its simplicity.
+ Bin width is inversely proportional to the square root of the data
+ size. Used by many programs for its simplicity.
Parameters
----------
@@ -162,7 +163,7 @@ def _hist_bin_sqrt(x):
Returns
-------
- w : An estimate of the optimal bin width for the given data.
+ h : An estimate of the optimal bin width for the given data.
"""
return x.ptp() / np.sqrt(x.size)
@@ -184,7 +185,7 @@ def _hist_bin_sturges(x):
Returns
-------
- w : An estimate of the optimal bin width for the given data.
+ h : An estimate of the optimal bin width for the given data.
"""
return x.ptp() / np.ceil(np.log2(x.size) + 1.0)
@@ -207,7 +208,7 @@ def _hist_bin_rice(x):
Returns
-------
- w : An estimate of the optimal bin width for the given data.
+ h : An estimate of the optimal bin width for the given data.
"""
return x.ptp() / (2.0 * x.size ** (1.0 / 3))
@@ -228,7 +229,7 @@ def _hist_bin_scott(x):
Returns
-------
- w : An estimate of the optimal bin width for the given data.
+ h : An estimate of the optimal bin width for the given data.
"""
return (24.0 * np.pi**0.5 / x.size)**(1.0 / 3.0) * np.std(x)
@@ -249,7 +250,7 @@ def _hist_bin_doane(x):
Returns
-------
- w : An estimate of the optimal bin width for the given data.
+ h : An estimate of the optimal bin width for the given data.
"""
if x.size > 2:
sg1 = np.sqrt(6.0 * (x.size - 2) / ((x.size + 1.0) * (x.size + 3)))
@@ -290,7 +291,7 @@ def _hist_bin_fd(x):
Returns
-------
- w : An estimate of the optimal bin width for the given data.
+ h : An estimate of the optimal bin width for the given data.
"""
iqr = np.subtract(*np.percentile(x, [75, 25]))
return 2.0 * iqr * x.size ** (-1.0 / 3.0)
@@ -314,12 +315,14 @@ def _hist_bin_auto(x):
Returns
-------
- w : An estimate of the optimal bin width for the given data.
+ h : An estimate of the optimal bin width for the given data.
See Also
--------
_hist_bin_fd, _hist_bin_sturges
"""
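+ # No explicit zero check is done here. If ptp is zero, so is IQR.
+ # NOTE(review): the converse does not hold in general (IQR can be zero
+ # while ptp is not), so the FD width may be zero -- confirm the caller
+ # handles a zero bin width.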
return min(_hist_bin_fd(x), _hist_bin_sturges(x))
@@ -351,8 +354,12 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
.. versionadded:: 1.11.0
If `bins` is a string from the list below, `histogram` will use
- the method chosen to calculate the optimal number of bins (see
- `Notes` for more detail on the estimators). For visualisation,
+ the method chosen to calculate the optimal bin width and
+ consequently the number of bins (see `Notes` for more detail on
+ the estimators) from the data that falls within the requested
+ range. While the bin width will be optimal for the actual data
+ in the range, the number of bins will be computed to fill the
+ entire range, including the empty portions. For visualisation,
using the 'auto' option is suggested. Weighted data is not
supported for automated bin size selection.
@@ -388,7 +395,11 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
range : (float, float), optional
The lower and upper range of the bins. If not provided, range
is simply ``(a.min(), a.max())``. Values outside the range are
- ignored.
+ ignored. The first element of the range must be less than or
+ equal to the second. `range` affects the automatic bin
+ computation as well. While bin width is computed to be optimal
+ based on the actual data within `range`, the bin count will fill
+ the entire range including portions containing no data.
normed : bool, optional
This keyword is deprecated in Numpy 1.6 due to confusing/buggy
behavior. It will be removed in Numpy 2.0. Use the ``density``
@@ -440,13 +451,16 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
.. versionadded:: 1.11.0
- The methods to estimate the optimal number of bins are well found in
- literature, and are inspired by the choices R provides for histogram
- visualisation. Note that having the number of bins proportional to
- :math:`n^{1/3}` is asymptotically optimal, which is why it appears
- in most estimators. These are simply plug-in methods that give good
- starting points for number of bins. In the equations below,
- :math:`h` is the binwidth and :math:`n_h` is the number of bins.
+ The methods to estimate the optimal number of bins are well founded
+ in literature, and are inspired by the choices R provides for
+ histogram visualisation. Note that having the number of bins
+ proportional to :math:`n^{1/3}` is asymptotically optimal, which is
+ why it appears in most estimators. These are simply plug-in methods
+ that give good starting points for number of bins. In the equations
+ below, :math:`h` is the binwidth and :math:`n_h` is the number of
+ bins. All estimators that compute bin counts are recast to bin width
+ using the `ptp` of the data. The final bin count is obtained from
+ ``np.round(np.ceil(range / h))``.
'Auto' (maximum of the 'Sturges' and 'FD' estimators)
A compromise to get a good value. For small datasets the Sturges
@@ -474,14 +488,14 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
estimator in the absence of outliers.
'Rice'
- .. math:: n_h = \left\lceil 2n^{1/3} \right\rceil
+ .. math:: n_h = 2n^{1/3}
The number of bins is only proportional to cube root of
``a.size``. It tends to overestimate the number of bins and it
does not take into account data variability.
'Sturges'
- .. math:: n_h = \left\lceil \log _{2}n+1 \right\rceil
+ .. math:: n_h = \log _{2}n+1
The number of bins is the base 2 log of ``a.size``. This
estimator assumes normality of data and is too conservative for
@@ -489,19 +503,19 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
``hist`` method.
'Doane'
- .. math:: n_h = \left\lceil 1 + \log_{2}(n) +
- \log_{2}(1 + \frac{|g_1|}{\sigma_{g_1})}
- \right\rceil
+ .. math:: n_h = 1 + \log_{2}(n) +
+ \log_{2}(1 + \frac{|g_1|}{\sigma_{g_1}})
g_1 = mean[(\frac{x - \mu}{\sigma})^3]
\sigma_{g_1} = \sqrt{\frac{6(n - 2)}{(n + 1)(n + 3)}}
An improved version of Sturges' formula that produces better
- estimates for non-normal datasets.
+ estimates for non-normal datasets. This estimator attempts to
+ account for the skew of the data.
'Sqrt'
- .. math:: n_h = \left\lceil \sqrt n \right\rceil
+ .. math:: n_h = \sqrt n
The simplest and fastest estimator. Only takes into account the
data size.
@@ -569,7 +583,7 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
# if `bins` is a string for an automatic method,
# this will replace it with the number of bins calculated
if bins not in _hist_bin_selectors:
- raise ValueError("{} not a valid estimator for bins".format(bins))
+ raise ValueError("{0} not a valid estimator for bins".format(bins))
if weights is not None:
raise TypeError("Automated estimation of the number of "
"bins is not supported for weighted data")