diff options
Diffstat (limited to 'numpy/lib')
-rw-r--r-- | numpy/lib/function_base.py | 593 | ||||
-rw-r--r-- | numpy/lib/nanfunctions.py | 212 | ||||
-rw-r--r-- | numpy/lib/tests/test_function_base.py | 177 |
3 files changed, 788 insertions, 194 deletions
diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py index 20e32a78d..67a34f6e1 100644 --- a/numpy/lib/function_base.py +++ b/numpy/lib/function_base.py @@ -52,6 +52,89 @@ __all__ = [ ] +_QuantileInterpolation = dict( + # --- HYNDMAN and FAN methods + # Discrete methods + inverted_cdf=dict( + get_virtual_index=lambda n, quantiles: _inverted_cdf(n, quantiles), + fix_gamma=lambda gamma, _: gamma, # should never be called + ), + averaged_inverted_cdf=dict( + get_virtual_index=lambda n, quantiles: (n * quantiles) - 1, + fix_gamma=lambda gamma, _: _get_gamma_mask( + shape=gamma.shape, + default_value=1., + conditioned_value=0.5, + where=gamma == 0), + ), + closest_observation=dict( + get_virtual_index=lambda n, quantiles: _closest_observation(n, + quantiles), + fix_gamma=lambda gamma, _: gamma, # should never be called + ), + # Continuous methods + interpolated_inverted_cdf=dict( + get_virtual_index=lambda n, quantiles: + _virtual_index_formula(n, quantiles, 0, 1), + fix_gamma=lambda gamma, _: gamma, + ), + hazen=dict( + get_virtual_index=lambda n, quantiles: + _virtual_index_formula(n, quantiles, 0.5, 0.5), + fix_gamma=lambda gamma, _: gamma, + ), + weibull=dict( + get_virtual_index=lambda n, quantiles: + _virtual_index_formula(n, quantiles, 0, 0), + fix_gamma=lambda gamma, _: gamma, + ), + # Default value + linear=dict( + get_virtual_index=lambda n, quantiles: + _virtual_index_formula(n, quantiles, 1, 1), + fix_gamma=lambda gamma, _: gamma, + ), + median_unbiased=dict( + get_virtual_index=lambda n, quantiles: + _virtual_index_formula(n, quantiles, 1 / 3.0, 1 / 3.0), + fix_gamma=lambda gamma, _: gamma, + ), + normal_unbiased=dict( + get_virtual_index=lambda n, quantiles: + _virtual_index_formula(n, quantiles, 3 / 8.0, 3 / 8.0), + fix_gamma=lambda gamma, _: gamma, + ), + # --- OTHER METHODS fixme add deprecated ? + lower=dict( + get_virtual_index=lambda n, quantiles: np.floor( + (n - 1) * quantiles).astype(np.intp), + fix_gamma=lambda gamma, _: gamma, + # should never be called, index dtype is int + ), + higher=dict( + get_virtual_index=lambda n, quantiles: np.ceil( + (n - 1) * quantiles).astype(np.intp), + fix_gamma=lambda gamma, _: gamma, + # should never be called, index dtype is int + ), + midpoint=dict( + get_virtual_index=lambda n, quantiles: 0.5 * ( + np.floor((n - 1) * quantiles) + + np.ceil((n - 1) * quantiles)), + fix_gamma=lambda gamma, index: _get_gamma_mask( + shape=gamma.shape, + default_value=0.5, + conditioned_value=0., + where=index % 1 == 0), + ), + nearest=dict( + get_virtual_index=lambda n, quantiles: np.around( + (n - 1) * quantiles).astype(np.intp), + fix_gamma=lambda gamma, _: gamma, + # should never be called, index dtype is int + )) + + def _rot90_dispatcher(m, k=None, axes=None): return (m,) @@ -3760,8 +3843,13 @@ def _percentile_dispatcher(a, q, axis=None, out=None, overwrite_input=None, @array_function_dispatch(_percentile_dispatcher) -def percentile(a, q, axis=None, out=None, - overwrite_input=False, interpolation='linear', keepdims=False): +def percentile(a, + q, + axis=None, + out=None, + overwrite_input=False, + interpolation="linear", + keepdims=False): """ Compute the q-th percentile of the data along the specified axis. @@ -3790,20 +3878,75 @@ def percentile(a, q, axis=None, out=None, calculations, to save memory. In this case, the contents of the input `a` after this function completes is undefined. - interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} + interpolation : str + Possible values: 'linear' (default), + 'inverted_cdf', 'averaged_inverted_cdf', + 'closest_observation', 'interpolated_inverted_cdf', + 'hazen', 'weibull', + 'median_unbiased', 'normal_unbiased', + 'lower', 'higher', + 'midpoint', 'nearest'. This optional parameter specifies the interpolation method to - use when the desired percentile lies between two data points - ``i < j``: + use when the desired quantile lies between two data points ``i < j``. + g is the fractional part of the index surrounded by ``i``. + alpha and beta are correction constants modifying i and j: + i + g = (q - alpha) / ( n - alpha - beta + 1 ) + * inverted_cdf: + method 1 of H&F. + This method give discontinuous results: + if g > 0 ; then take j + if g = 0 ; then take i + * averaged_inverted_cdf: + method 2 of H&F. + This method give discontinuous results: + if g > 0 ; then take j + if g = 0 ; then average between bounds + * closest_observation: + method 3 of H&F. + This method give discontinuous results: + if g > 0 ; then take j + if g = 0 and index is odd ; then take j + if g = 0 and index is even ; then take i + * interpolated_inverted_cdf: + method 4 of H&F. + This method give continuous results using: + alpha = 0 + beta = 1 + * hazen: + method 5 of H&F. + This method give continuous results using: + alpha = 1/2 + beta = 1/2 + * weibull: + method 6 of H&F. + This method give continuous results using: + alpha = 0 + beta = 0 + * inclusive: + Default method, aliased with "linear". + method 7 of H&F. + This method give continuous results using: + alpha = 1 + beta = 1 + * median_unbiased: + method 8 of H&F. + This method is probably the best method if the sample distribution + function is unknown (see reference). + This method give continuous results using: + alpha = 1/3 + beta = 1/3 + * normal_unbiased: + method 9 of H&F. + This method is probably the best method if the sample distribution + function is known to be normal. + This method give continuous results using: + alpha = 3/8 + beta = 3/8 + * lower: ``i``. + * higher: ``j``. + * nearest: ``i`` or ``j``, whichever is nearest. + * midpoint: ``(i + j) / 2``. - * 'linear': ``i + (j - i) * fraction``, where ``fraction`` - is the fractional part of the index surrounded by ``i`` - and ``j``. - * 'lower': ``i``. - * 'higher': ``j``. - * 'nearest': ``i`` or ``j``, whichever is nearest. - * 'midpoint': ``(i + j) / 2``. - - .. versionadded:: 1.9.0 keepdims : bool, optional If this is set to True, the axes which are reduced are left in the result as dimensions with size one. With this option, the @@ -3897,6 +4040,11 @@ def percentile(a, q, axis=None, out=None, ax.legend() plt.show() + References + ---------- + [Hyndman&Fan] Hyndman, R. J., & Fan, Y. (1996). + Sample quantiles in statistical packages. + The American Statistician, 50(4), 361-365. """ q = np.true_divide(q, 100) q = asanyarray(q) # undo any decay that the ufunc performed (see gh-13105) @@ -3912,8 +4060,13 @@ def _quantile_dispatcher(a, q, axis=None, out=None, overwrite_input=None, @array_function_dispatch(_quantile_dispatcher) -def quantile(a, q, axis=None, out=None, - overwrite_input=False, interpolation='linear', keepdims=False): +def quantile(a, + q, + axis=None, + out=None, + overwrite_input=False, + interpolation="linear", + keepdims=False): """ Compute the q-th quantile of the data along the specified axis. @@ -3938,18 +4091,75 @@ def quantile(a, q, axis=None, out=None, If True, then allow the input array `a` to be modified by intermediate calculations, to save memory. In this case, the contents of the input `a` after this function completes is undefined. - interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} + interpolation : str + Possible values: 'linear' (default), + 'inverted_cdf', 'averaged_inverted_cdf', + 'closest_observation', 'interpolated_inverted_cdf', + 'hazen', 'weibull', + 'median_unbiased', 'normal_unbiased', + 'lower', 'higher', + 'midpoint', 'nearest'. This optional parameter specifies the interpolation method to - use when the desired quantile lies between two data points - ``i < j``: - - * linear: ``i + (j - i) * fraction``, where ``fraction`` - is the fractional part of the index surrounded by ``i`` - and ``j``. - * lower: ``i``. - * higher: ``j``. - * nearest: ``i`` or ``j``, whichever is nearest. - * midpoint: ``(i + j) / 2``. + use when the desired quantile lies between two data points ``i < j``. + g is the fractional part of the index surrounded by ``i``. + alpha and beta are correction constants modifying i and j: + i + g = (q - alpha) / ( n - alpha - beta + 1 ) + * inverted_cdf: + method 1 of H&F. + This method give discontinuous results: + if g > 0 ; then take j + if g = 0 ; then take i + * averaged_inverted_cdf: + method 2 of H&F. + This method give discontinuous results: + if g > 0 ; then take j + if g = 0 ; then average between bounds + * closest_observation: + method 3 of H&F. + This method give discontinuous results: + if g > 0 ; then take j + if g = 0 and index is odd ; then take j + if g = 0 and index is even ; then take i + * interpolated_inverted_cdf: + method 4 of H&F. + This method give continuous results using: + alpha = 0 + beta = 1 + * hazen: + method 5 of H&F. + This method give continuous results using: + alpha = 1/2 + beta = 1/2 + * weibull: + method 6 of H&F. + This method give continuous results using: + alpha = 0 + beta = 0 + * linear: + Default method. + method 7 of H&F. + This method give continuous results using: + alpha = 1 + beta = 1 + * median_unbiased: + method 8 of H&F. + This method is probably the best method if the sample distribution + function is unknown (see reference). + This method give continuous results using: + alpha = 1/3 + beta = 1/3 + * normal_unbiased: + method 9 of H&F. + This method is probably the best method if the sample distribution + function is known to be normal. + This method give continuous results using: + alpha = 3/8 + beta = 3/8 + * lower: ``i``. + * higher: ``j``. + * nearest: ``i`` or ``j``, whichever is nearest. + * midpoint: ``(i + j) / 2``. + keepdims : bool, optional If this is set to True, the axes which are reduced are left in the result as dimensions with size one. With this option, the @@ -4010,6 +4220,12 @@ def quantile(a, q, axis=None, out=None, >>> np.quantile(b, 0.5, axis=1, overwrite_input=True) array([7., 2.]) >>> assert not np.all(a == b) + + References + ---------- + [Hyndman&Fan] Hyndman, R. J., & Fan, Y. (1996). + Sample quantiles in statistical packages. + The American Statistician, 50(4), 361-365. """ q = np.asanyarray(q) if not _quantile_is_valid(q): @@ -4018,10 +4234,19 @@ def quantile(a, q, axis=None, out=None, a, q, axis, out, overwrite_input, interpolation, keepdims) -def _quantile_unchecked(a, q, axis=None, out=None, overwrite_input=False, - interpolation='linear', keepdims=False): +def _quantile_unchecked(a, + q, + axis=None, + out=None, + overwrite_input=False, + interpolation="linear", + keepdims=False): """Assumes that q is in [0, 1], and is an ndarray""" - r, k = _ureduce(a, func=_quantile_ureduce_func, q=q, axis=axis, out=out, + r, k = _ureduce(a, + func=_quantiles_ureduce_func, + q=q, + axis=axis, + out=out, overwrite_input=overwrite_input, interpolation=interpolation) if keepdims: @@ -4042,122 +4267,242 @@ def _quantile_is_valid(q): return True -def _lerp(a, b, t, out=None): - """ Linearly interpolate from a to b by a factor of t """ - diff_b_a = subtract(b, a) - # asanyarray is a stop-gap until gh-13105 - lerp_interpolation = asanyarray(add(a, diff_b_a*t, out=out)) - subtract(b, diff_b_a * (1 - t), out=lerp_interpolation, where=t>=0.5) - if lerp_interpolation.ndim == 0 and out is None: - lerp_interpolation = lerp_interpolation[()] # unpack 0d arrays - return lerp_interpolation +def _virtual_index_formula(n: np.array, + quantiles: np.array, + alpha: float, + beta: float) -> np.array: + """ + Compute the floating point indexes of an array for the linear + interpolation of quantiles. + Reference: + Hyndman&Fan paper "Sample Quantiles in Statistical Packages", + DOI: 10.1080/00031305.1996.10473566 + """ + return n * quantiles + ( + alpha + quantiles * (1 - alpha - beta) + ) - 1 -def _quantile_ureduce_func(a, q, axis=None, out=None, overwrite_input=False, - interpolation='linear', keepdims=False): - a = asarray(a) +def _get_gamma(virtual_indexes: np.array, + previous_indexes: np.array, + interpolation: _QuantileInterpolation) -> np.array: + """ + Compute the gamma (a.k.a 'm' or weight) for the linear interpolation + of quantiles. + When gamma == 0 the left bound will be picked, + When gamma == 1 the right bound will be. + """ + gamma = np.asanyarray(virtual_indexes - previous_indexes) + gamma = interpolation["fix_gamma"](gamma, virtual_indexes) + return np.asanyarray(gamma) - # ufuncs cause 0d array results to decay to scalars (see gh-13105), which - # makes them problematic for __setitem__ and attribute access. As a - # workaround, we call this on the result of every ufunc on a possibly-0d - # array. - not_scalar = np.asanyarray - # prepare a for partitioning - if overwrite_input: - if axis is None: - ap = a.ravel() - else: - ap = a - else: - if axis is None: - ap = a.flatten() - else: - ap = a.copy() +def _linear_interpolation_formula( + left: np.array, right: np.array, gamma: np.array, out: np.array = None +) -> np.array: + """ + Compute the linear interpolation weighted by gamma on each point of + two same shape array. + """ + # Equivalent to gamma * right + (1 - gamma) * left + # see gh-14685 + diff_right_left = subtract(right, left) + result = asanyarray(add(left, diff_right_left * gamma, out=out)) + result = subtract(right, + diff_right_left * (1 - gamma), + out=result, + where=gamma >= 0.5) + return result - if axis is None: - axis = 0 +def _get_gamma_mask(shape, default_value, conditioned_value, where): + out = np.full(shape, default_value) + out[where] = conditioned_value + return out + + +def _discret_interpolation_to_boundaries(index, gamma_condition_fun): + previous = np.floor(index) + next = previous + 1 + gamma = index - previous + return _get_gamma_mask(shape=index.shape, + default_value=next, + conditioned_value=previous, + where=gamma_condition_fun(gamma, index) + ).astype(np.intp) + + +def _closest_observation(n, quantiles): + gamma_fun = lambda gamma, index: (gamma == 0) & (np.floor(index) % 2 == 0) + return _discret_interpolation_to_boundaries((n * quantiles) - 1 - 0.5, + gamma_fun) + + +def _inverted_cdf(n, quantiles): + gamma_fun = lambda gamma, _: (gamma == 0) + return _discret_interpolation_to_boundaries((n * quantiles) - 1, + gamma_fun) + + +def _quantiles_ureduce_func( + a: np.array, + q: np.array, + axis: int = None, + out=None, + overwrite_input: bool = False, + interpolation="linear", +) -> np.array: if q.ndim > 2: # The code below works fine for nd, but it might not have useful # semantics. For now, keep the supported dimensions the same as it was # before. raise ValueError("q must be a scalar or 1d") - - Nx = ap.shape[axis] - indices = not_scalar(q * (Nx - 1)) - # round fractional indices according to interpolation method - if interpolation == 'lower': - indices = floor(indices).astype(intp) - elif interpolation == 'higher': - indices = ceil(indices).astype(intp) - elif interpolation == 'midpoint': - indices = 0.5 * (floor(indices) + ceil(indices)) - elif interpolation == 'nearest': - indices = around(indices).astype(intp) - elif interpolation == 'linear': - pass # keep index as fraction and interpolate + if overwrite_input: + if axis is None: + axis = 0 + arr = a.ravel() + else: + arr = a else: - raise ValueError( - "interpolation can only be 'linear', 'lower' 'higher', " - "'midpoint', or 'nearest'") + if axis is None: + axis = 0 + arr = a.flatten() + else: + arr = a.copy() + result = _quantile(arr, + quantiles=q, + axis=axis, + interpolation=interpolation, + out=out) + if result.ndim == 0 and out is None: + result = result[()] # unpack 0d arrays + elif result.size == 1 and out is None and q.ndim == 0: + result = result[0] + return result - # The dimensions of `q` are prepended to the output shape, so we need the - # axis being sampled from `ap` to be first. - ap = np.moveaxis(ap, axis, 0) - del axis - if np.issubdtype(indices.dtype, np.integer): - # take the points along axis +def _get_indexes(arr, virtual_indexes, valid_values_count): + """ + Get the valid indexes of arr neighbouring virtual_indexes. + Note + This is a companion function to linear interpolation of + Quantiles - if np.issubdtype(a.dtype, np.inexact): + Returns + ------- + (previous_indexes, next_indexes): Tuple + A Tuple of virtual_indexes neighbouring indexes + """ + previous_indexes = np.asanyarray(np.floor(virtual_indexes)) + next_indexes = np.asanyarray(previous_indexes + 1) + indexes_above_bounds = virtual_indexes >= valid_values_count - 1 + # When indexes is above max index, take the max value of the array + if indexes_above_bounds.any(): + previous_indexes[indexes_above_bounds] = -1 + next_indexes[indexes_above_bounds] = -1 + # When indexes is below min index, take the min value of the array + indexes_below_bounds = virtual_indexes < 0 + if indexes_below_bounds.any(): + previous_indexes[indexes_below_bounds] = 0 + next_indexes[indexes_below_bounds] = 0 + if np.issubdtype(arr.dtype, np.inexact): + # After the sort, slices having NaNs will have for last element a NaN + virtual_indexes_nans = np.isnan(virtual_indexes) + if virtual_indexes_nans.any(): + previous_indexes[virtual_indexes_nans] = -1 + next_indexes[virtual_indexes_nans] = -1 + previous_indexes = previous_indexes.astype(np.intp) + next_indexes = next_indexes.astype(np.intp) + return previous_indexes, next_indexes + + +def _quantile( + arr: np.array, + quantiles: np.array, + axis: int = -1, + interpolation="linear", + out=None, +): + """ + Private function that doesn't support extended axis or keepdims. + These methods are extended to this function using _ureduce + See nanpercentile for parameter usage + It computes the quantiles of the array for the given axis. + A linear interpolation is performed based on the `interpolation`. + + By default, the interpolation is "linear" where + alpha == beta == 1 which performs the 7th method of Hyndman&Fan. + With "median_unbiased" we get alpha == beta == 1/3 + thus the 8th method of Hyndman&Fan. + """ + # --- Setup + arr = np.asanyarray(arr) + values_count = arr.shape[axis] + # The dimensions of `q` are prepended to the output shape, so we need the + # axis being sampled from `arr` to be last. + DATA_AXIS = 0 + arr = np.moveaxis(arr, axis, destination=DATA_AXIS) + # --- Computation of indexes + # Index where to find the value in the sorted array. + # Virtual because it is a floating point value, not an valid index. + # The nearest neighbours are used for interpolation + try: + interpolation = _QuantileInterpolation[interpolation] + except KeyError: + raise ValueError( + f"{interpolation!r} is not a valid interpolation. Use one of: " + f"{_QuantileInterpolation.keys()}") + virtual_indexes = interpolation["get_virtual_index"](values_count, + quantiles) + virtual_indexes = np.asanyarray(virtual_indexes) + if np.issubdtype(virtual_indexes.dtype, np.integer): + # No interpolation needed, take the points along axis + if np.issubdtype(arr.dtype, np.inexact): # may contain nan, which would sort to the end - ap.partition(concatenate((indices.ravel(), [-1])), axis=0) - n = np.isnan(ap[-1]) + arr.partition(concatenate((virtual_indexes.ravel(), [-1])), axis=0) + slices_having_nans = np.isnan(arr[-1]) else: # cannot contain nan - ap.partition(indices.ravel(), axis=0) - n = np.array(False, dtype=bool) - - r = take(ap, indices, axis=0, out=out) - + arr.partition(virtual_indexes.ravel(), axis=0) + slices_having_nans = np.array(False, dtype=bool) + result = np.asanyarray(take(arr, virtual_indexes, axis=0, out=out)) else: - # weight the points above and below the indices - - indices_below = not_scalar(floor(indices)).astype(intp) - indices_above = not_scalar(indices_below + 1) - indices_above[indices_above > Nx - 1] = Nx - 1 - - if np.issubdtype(a.dtype, np.inexact): - # may contain nan, which would sort to the end - ap.partition(concatenate(( - indices_below.ravel(), indices_above.ravel(), [-1] - )), axis=0) - n = np.isnan(ap[-1]) + previous_indexes, next_indexes = _get_indexes(arr, + virtual_indexes, + values_count) + # --- Sorting + arr.partition( + np.unique(np.concatenate(([0, -1], + previous_indexes.ravel(), + next_indexes.ravel(), + ))), + axis=DATA_AXIS) + if np.issubdtype(arr.dtype, np.inexact): + slices_having_nans = np.isnan( + take(arr, indices=-1, axis=DATA_AXIS) + ) else: - # cannot contain nan - ap.partition(concatenate(( - indices_below.ravel(), indices_above.ravel() - )), axis=0) - n = np.array(False, dtype=bool) - - weights_shape = indices.shape + (1,) * (ap.ndim - 1) - weights_above = not_scalar(indices - indices_below).reshape(weights_shape) - - x_below = take(ap, indices_below, axis=0) - x_above = take(ap, indices_above, axis=0) - - r = _lerp(x_below, x_above, weights_above, out=out) - - # if any slice contained a nan, then all results on that slice are also nan - if np.any(n): - if r.ndim == 0 and out is None: + slices_having_nans = None + # --- Get values from indexes + previous = np.take(arr, previous_indexes, axis=DATA_AXIS) + next = np.take(arr, next_indexes, axis=DATA_AXIS) + # --- Linear interpolation + gamma = _get_gamma(virtual_indexes, + previous_indexes, + interpolation) + result_shape = virtual_indexes.shape + (1,) * (arr.ndim - 1) + gamma = gamma.reshape(result_shape) + result = _linear_interpolation_formula(previous, + next, + gamma, + out=out) + if np.any(slices_having_nans): + if result.ndim == 0 and out is None: # can't write to a scalar - r = a.dtype.type(np.nan) + result = np.array(np.nan, dtype=arr.dtype) else: - r[..., n] = a.dtype.type(np.nan) - - return r + result[..., slices_having_nans] = np.nan + return result def _trapz_dispatcher(y, x=None, dx=None, axis=None): diff --git a/numpy/lib/nanfunctions.py b/numpy/lib/nanfunctions.py index 08d9b42bb..e90c19b4a 100644 --- a/numpy/lib/nanfunctions.py +++ b/numpy/lib/nanfunctions.py @@ -23,6 +23,7 @@ Functions import functools import warnings import numpy as np +from numpy.lib.function_base import _QuantileInterpolation from numpy.lib import function_base from numpy.core import overrides @@ -1229,8 +1230,15 @@ def _nanpercentile_dispatcher(a, q, axis=None, out=None, overwrite_input=None, @array_function_dispatch(_nanpercentile_dispatcher) -def nanpercentile(a, q, axis=None, out=None, overwrite_input=False, - interpolation='linear', keepdims=np._NoValue): +def nanpercentile( + a, + q, + axis=None, + out=None, + overwrite_input=False, + interpolation="linear", + keepdims=np._NoValue, +): """ Compute the qth percentile of the data along the specified axis, while ignoring nan values. @@ -1259,18 +1267,74 @@ def nanpercentile(a, q, axis=None, out=None, overwrite_input=False, If True, then allow the input array `a` to be modified by intermediate calculations, to save memory. In this case, the contents of the input `a` after this function completes is undefined. - interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} + interpolation : str + Possible values: 'linear' (default), + 'inverted_cdf', 'averaged_inverted_cdf', + 'closest_observation', 'interpolated_inverted_cdf', + 'hazen', 'weibull', + 'median_unbiased', 'normal_unbiased', + 'lower', 'higher', + 'midpoint', 'nearest'. This optional parameter specifies the interpolation method to - use when the desired percentile lies between two data points - ``i < j``: - - * 'linear': ``i + (j - i) * fraction``, where ``fraction`` - is the fractional part of the index surrounded by ``i`` - and ``j``. - * 'lower': ``i``. - * 'higher': ``j``. - * 'nearest': ``i`` or ``j``, whichever is nearest. - * 'midpoint': ``(i + j) / 2``. + use when the desired quantile lies between two data points ``i < j``. + g is the fractional part of the index surrounded by ``i``. + alpha and beta are correction constants modifying i and j: + i + g = (q - alpha) / ( n - alpha - beta + 1 ) + * inverted_cdf: + method 1 of H&F. + This method give discontinuous results: + if g > 0 ; then take j + if g = 0 ; then take i + * averaged_inverted_cdf: + method 2 of H&F. + This method give discontinuous results: + if g > 0 ; then take j + if g = 0 ; then average between bounds + * closest_observation: + method 3 of H&F. + This method give discontinuous results: + if g > 0 ; then take j + if g = 0 and index is odd ; then take j + if g = 0 and index is even ; then take i + * interpolated_inverted_cdf: + method 4 of H&F. + This method give continuous results using: + alpha = 0 + beta = 1 + * hazen: + method 5 of H&F. + This method give continuous results using: + alpha = 1/2 + beta = 1/2 + * weibull: + method 6 of H&F. + This method give continuous results using: + alpha = 0 + beta = 0 + * linear: + Default method. + method 7 of H&F. + This method give continuous results using: + alpha = 1 + beta = 1 + * median_unbiased: + method 8 of H&F. + This method is probably the best method if the sample distribution + function is unknown (see reference). + This method give continuous results using: + alpha = 1/3 + beta = 1/3 + * normal_unbiased: + method 9 of H&F. + This method is probably the best method if the sample distribution + function is known to be normal. + This method give continuous results using: + alpha = 3/8 + beta = 3/8 + * lower: ``i``. + * higher: ``j``. + * nearest: ``i`` or ``j``, whichever is nearest. + * midpoint: ``(i + j) / 2``. keepdims : bool, optional If this is set to True, the axes which are reduced are left in the result as dimensions with size one. With this option, the @@ -1342,7 +1406,9 @@ def nanpercentile(a, q, axis=None, out=None, overwrite_input=False, """ a = np.asanyarray(a) - q = np.true_divide(q, 100.0) # handles the asarray for us too + q = np.true_divide(q, 100.0) + # undo any decay that the ufunc performed (see gh-13105) + q = np.asanyarray(q) if not function_base._quantile_is_valid(q): raise ValueError("Percentiles must be in the range [0, 100]") return _nanquantile_unchecked( @@ -1355,8 +1421,15 @@ def _nanquantile_dispatcher(a, q, axis=None, out=None, overwrite_input=None, @array_function_dispatch(_nanquantile_dispatcher) -def nanquantile(a, q, axis=None, out=None, overwrite_input=False, - interpolation='linear', keepdims=np._NoValue): +def nanquantile( + a, + q, + axis=None, + out=None, + overwrite_input=False, + interpolation="linear", + keepdims=np._NoValue, +): """ Compute the qth quantile of the data along the specified axis, while ignoring nan values. @@ -1384,19 +1457,74 @@ def nanquantile(a, q, axis=None, out=None, overwrite_input=False, If True, then allow the input array `a` to be modified by intermediate calculations, to save memory. In this case, the contents of the input `a` after this function completes is undefined. - interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} + interpolation : str + Possible values: 'linear' (default), + 'inverted_cdf', 'averaged_inverted_cdf', + 'closest_observation', 'interpolated_inverted_cdf', + 'hazen', 'weibull', + 'median_unbiased', 'normal_unbiased', + 'lower', 'higher', + 'midpoint', 'nearest'. This optional parameter specifies the interpolation method to - use when the desired quantile lies between two data points - ``i < j``: - - * linear: ``i + (j - i) * fraction``, where ``fraction`` - is the fractional part of the index surrounded by ``i`` - and ``j``. + use when the desired quantile lies between two data points ``i < j``. + g is the fractional part of the index surrounded by ``i``. + alpha and beta are correction constants modifying i and j: + i + g = (q - alpha) / ( n - alpha - beta + 1 ) + * inverted_cdf: + method 1 of H&F. + This method give discontinuous results: + if g > 0 ; then take j + if g = 0 ; then take i + * averaged_inverted_cdf: + method 2 of H&F. + This method give discontinuous results: + if g > 0 ; then take j + if g = 0 ; then average between bounds + * closest_observation: + method 3 of H&F. + This method give discontinuous results: + if g > 0 ; then take j + if g = 0 and index is odd ; then take j + if g = 0 and index is even ; then take i + * interpolated_inverted_cdf: + method 4 of H&F. + This method give continuous results using: + alpha = 0 + beta = 1 + * hazen: + method 5 of H&F. + This method give continuous results using: + alpha = 1/2 + beta = 1/2 + * weibull: + method 6 of H&F. + This method give continuous results using: + alpha = 0 + beta = 0 + * linear: + Default method. + method 7 of H&F. + This method give continuous results using: + alpha = 1 + beta = 1 + * median_unbiased: + method 8 of H&F. + This method is probably the best method if the sample distribution + function is unknown (see reference). + This method give continuous results using: + alpha = 1/3 + beta = 1/3 + * normal_unbiased: + method 9 of H&F. + This method is probably the best method if the sample distribution + function is known to be normal. + This method give continuous results using: + alpha = 3/8 + beta = 3/8 * lower: ``i``. * higher: ``j``. * nearest: ``i`` or ``j``, whichever is nearest. * midpoint: ``(i + j) / 2``. - keepdims : bool, optional If this is set to True, the axes which are reduced are left in the result as dimensions with size one. With this option, the @@ -1462,26 +1590,39 @@ def nanquantile(a, q, axis=None, out=None, overwrite_input=False, a, q, axis, out, overwrite_input, interpolation, keepdims) -def _nanquantile_unchecked(a, q, axis=None, out=None, overwrite_input=False, - interpolation='linear', keepdims=np._NoValue): +def _nanquantile_unchecked( + a, + q, + axis=None, + out=None, + overwrite_input=False, + interpolation="linear", + keepdims=np._NoValue, +): """Assumes that q is in [0, 1], and is an ndarray""" # apply_along_axis in _nanpercentile doesn't handle empty arrays well, # so deal them upfront if a.size == 0: return np.nanmean(a, axis, out=out, keepdims=keepdims) - - r, k = function_base._ureduce( - a, func=_nanquantile_ureduce_func, q=q, axis=axis, out=out, - overwrite_input=overwrite_input, interpolation=interpolation - ) + r, k = function_base._ureduce(a, + func=_nanquantile_ureduce_func, + q=q, + axis=axis, + out=out, + overwrite_input=overwrite_input, + interpolation=interpolation) if keepdims and keepdims is not np._NoValue: return r.reshape(q.shape + k) else: return r -def _nanquantile_ureduce_func(a, q, axis=None, out=None, overwrite_input=False, - interpolation='linear'): +def _nanquantile_ureduce_func(a, + q, + axis=None, + out=None, + overwrite_input=False, + interpolation= "linear"): """ Private function that doesn't support extended axis or keepdims. These methods are extended to this function using _ureduce @@ -1504,7 +1645,10 @@ def _nanquantile_ureduce_func(a, q, axis=None, out=None, overwrite_input=False, return result -def _nanquantile_1d(arr1d, q, overwrite_input=False, interpolation='linear'): +def _nanquantile_1d(arr1d, + q, + overwrite_input=False, + interpolation= "linear"): """ Private function for rank 1 arrays. Compute quantile ignoring NaNs. See nanpercentile for parameter usage diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py index c7dfe5673..4f0db7bdb 100644 --- a/numpy/lib/tests/test_function_base.py +++ b/numpy/lib/tests/test_function_base.py @@ -2903,36 +2903,134 @@ class TestPercentile: [1, 1, 1]]) assert_array_equal(np.percentile(x, 50, axis=0), [1, 1, 1]) - def test_linear(self): - + @pytest.mark.parametrize("dtype", np.typecodes["AllFloat"]) + def test_linear_nan_1D(self, dtype): + # METHOD 1 of H&F + arr = np.asarray([15.0, np.NAN, 35.0, 40.0, 50.0], dtype=dtype) + res = np.percentile( + arr, + 40.0, + interpolation="linear") + np.testing.assert_equal(res, np.NAN) + np.testing.assert_equal(res.dtype, arr.dtype) + + TYPE_CODES = np.typecodes["AllInteger"] + np.typecodes["AllFloat"] + "O" + + @pytest.mark.parametrize("dtype", TYPE_CODES) + def test_linear_inverted_cdf(self, dtype): + # METHOD 1 of H&F + arr = np.asarray([15.0, 20.0, 35.0, 40.0, 50.0], dtype=dtype) + res = np.percentile( + arr, + 40.0, + interpolation="inverted_cdf") + np.testing.assert_almost_equal(res, 20, 15) + + @pytest.mark.parametrize("dtype", TYPE_CODES) + def test_linear_averaged_inverted_cdf(self, dtype): + # METHOD 2 of H&F + arr = np.asarray([15.0, 20.0, 35.0, 40.0, 50.0], dtype=dtype) + res = np.percentile( + arr, + 40.0, + interpolation="averaged_inverted_cdf") + np.testing.assert_almost_equal(res, 27.5, 15) + + @pytest.mark.parametrize("dtype", TYPE_CODES) + def test_linear_closest_observation(self, dtype): + # METHOD 3 of H&F + arr = np.asarray([15.0, 20.0, 35.0, 40.0, 50.0], dtype=dtype) + res = np.percentile( + arr, + 40.0, + interpolation="closest_observation") + np.testing.assert_almost_equal(res, 20, 15) + + @pytest.mark.parametrize("dtype", TYPE_CODES) + def test_linear_interpolated_inverted_cdf(self, dtype): + # METHOD 4 of H&F + arr = np.asarray([15.0, 20.0, 35.0, 40.0, 50.0], dtype=dtype) + res = np.percentile( + arr, + 40.0, + interpolation="interpolated_inverted_cdf") + np.testing.assert_almost_equal(res, 20, 15) + + @pytest.mark.parametrize("dtype", TYPE_CODES) + def test_linear_hazen(self, dtype): + # METHOD 5 of H&F + arr = np.asarray([15.0, 20.0, 35.0, 40.0, 50.0], dtype=dtype) + res = np.percentile( + arr, + 40.0, + interpolation="hazen") + np.testing.assert_almost_equal(res, 27.5, 15) + + @pytest.mark.parametrize("dtype", TYPE_CODES) + def test_linear_weibull(self, dtype): + # METHOD 6 of H&F + arr = np.asarray([15.0, 20.0, 35.0, 40.0, 50.0], dtype=dtype) + res = np.percentile( + arr, + 40.0, + interpolation="weibull") + np.testing.assert_almost_equal(res, 26, 15) + + @pytest.mark.parametrize("dtype", TYPE_CODES) + def test_linear_linear(self, dtype): + # METHOD 7 of H&F # Test defaults assert_equal(np.percentile(range(10), 50), 4.5) - - # explicitly specify interpolation_method 'linear' (the default) - assert_equal(np.percentile(range(10), 50, - interpolation='linear'), 4.5) - - def test_lower_higher(self): - - # interpolation_method 'lower'/'higher' - assert_equal(np.percentile(range(10), 50, + # explicit interpolation_method (the default) + res = np.percentile([15.0, 20.0, 35.0, 40.0, 50.0], + 40, + interpolation="linear") + np.testing.assert_almost_equal(res, 29, 15) + + @pytest.mark.parametrize("dtype", TYPE_CODES) + def test_linear_median_unbiased(self, dtype): + # METHOD 8 of H&F + arr = np.asarray([15.0, 20.0, 35.0, 40.0, 50.0], dtype=dtype) + res = np.percentile( + arr, + 40.0, + interpolation="median_unbiased") + np.testing.assert_almost_equal(res, 27, 14) + + @pytest.mark.parametrize("dtype", TYPE_CODES) + def test_linear_normal_unbiased(self, dtype): + # METHOD 9 of H&F + arr = np.asarray([15.0, 20.0, 35.0, 40.0, 50.0], dtype=dtype) + res = np.percentile( + arr, + 40.0, + interpolation="normal_unbiased") + np.testing.assert_almost_equal(res, 27.125, 15) + + @pytest.mark.parametrize("dtype", TYPE_CODES) + def test_lower_higher(self, dtype): + assert_equal(np.percentile(np.arange(10, dtype=dtype), 50, interpolation='lower'), 4) - assert_equal(np.percentile(range(10), 50, + assert_equal(np.percentile(np.arange(10, dtype=dtype), 50, interpolation='higher'), 5) - def test_midpoint(self): - assert_equal(np.percentile(range(10), 51, + @pytest.mark.parametrize("dtype", TYPE_CODES) + def test_midpoint(self, dtype): + assert_equal(np.percentile(np.arange(10, dtype=dtype), 51, interpolation='midpoint'), 4.5) - assert_equal(np.percentile(range(11), 51, + assert_equal(np.percentile(np.arange(9, dtype=dtype) + 1, 50, + interpolation='midpoint'), 5) + assert_equal(np.percentile(np.arange(11, dtype=dtype), 51, interpolation='midpoint'), 5.5) - assert_equal(np.percentile(range(11), 50, + assert_equal(np.percentile(np.arange(11, dtype=dtype), 50, interpolation='midpoint'), 5) - def test_nearest(self): - assert_equal(np.percentile(range(10), 51, + @pytest.mark.parametrize("dtype", TYPE_CODES) + def test_nearest(self, dtype): + assert_equal(np.percentile(np.arange(10, dtype=dtype), 51, interpolation='nearest'), 5) - assert_equal(np.percentile(range(10), 49, - interpolation='nearest'), 4) + assert_equal(np.percentile(np.arange(10, dtype=dtype), 49, + interpolation='nearest'), 4) def test_sequence(self): x = np.arange(8) * 0.5 @@ -3038,18 +3136,18 @@ class TestPercentile: y = np.zeros((3,)) p = (1, 2, 3) np.percentile(x, p, out=y) - assert_equal(y, np.percentile(x, p)) + assert_equal(np.percentile(x, p), y) x = np.array([[1, 2, 3], [4, 5, 6]]) y = np.zeros((3, 3)) np.percentile(x, p, axis=0, out=y) - assert_equal(y, np.percentile(x, p, axis=0)) + assert_equal(np.percentile(x, p, axis=0), y) y = np.zeros((3, 2)) np.percentile(x, p, axis=1, out=y) - assert_equal(y, np.percentile(x, p, axis=1)) + assert_equal(np.percentile(x, p, axis=1), y) x = np.arange(12).reshape(3, 4) # q.dim > 1, float @@ -3293,6 +3391,7 @@ class TestPercentile: with pytest.raises(ValueError, match="Percentiles must be in"): np.percentile([1, 2, 3, 4.0], q) + class TestQuantile: # most of this is already tested by TestPercentile @@ -3302,6 +3401,7 @@ class TestQuantile: assert_equal(np.quantile(x, 1), 3.5) assert_equal(np.quantile(x, 0.5), 1.75) + @pytest.mark.xfail(reason="See gh-19154") def test_correct_quantile_value(self): a = np.array([True]) tf_quant = np.quantile(True, False) @@ -3310,12 +3410,11 @@ class TestQuantile: a = np.array([False, True, True]) quant_res = np.quantile(a, a) assert_array_equal(quant_res, a) - assert_equal(a.dtype, quant_res.dtype) + assert_equal(quant_res.dtype, a.dtype) def test_fraction(self): # fractional input, integral quantile x = [Fraction(i, 2) for i in range(8)] - q = np.quantile(x, 0) assert_equal(q, 0) assert_equal(type(q), Fraction) @@ -3352,6 +3451,12 @@ class TestQuantile: np.quantile(np.arange(100.), p, interpolation="midpoint") assert_array_equal(p, p0) + @pytest.mark.parametrize("dtype", np.typecodes["AllInteger"]) + def test_quantile_preserve_int_type(self, dtype): + res = np.quantile(np.array([1, 2], dtype=dtype), [0.5], + interpolation="nearest") + assert res.dtype == dtype + def test_quantile_monotonic(self): # GH 14685 # test that the return value of quantile is monotonic if p0 is ordered @@ -3380,9 +3485,9 @@ class TestLerp: min_value=-1e300, max_value=1e300), b = st.floats(allow_nan=False, allow_infinity=False, min_value=-1e300, max_value=1e300)) - def test_lerp_monotonic(self, t0, t1, a, b): - l0 = np.lib.function_base._lerp(a, b, t0) - l1 = np.lib.function_base._lerp(a, b, t1) + def test_linear_interpolation_formula_monotonic(self, t0, t1, a, b): + l0 = nfb._linear_interpolation_formula(a, b, t0) + l1 = nfb._linear_interpolation_formula(a, b, t1) if t0 == t1 or a == b: assert l0 == l1 # uninteresting elif (t0 < t1) == (a < b): @@ -3396,11 +3501,11 @@ class TestLerp: min_value=-1e300, max_value=1e300), b=st.floats(allow_nan=False, allow_infinity=False, min_value=-1e300, max_value=1e300)) - def test_lerp_bounded(self, t, a, b): + def test_linear_interpolation_formula_bounded(self, t, a, b): if a <= b: - assert a <= np.lib.function_base._lerp(a, b, t) <= b + assert a <= nfb._linear_interpolation_formula(a, b, t) <= b else: - assert b <= np.lib.function_base._lerp(a, b, t) <= a + assert b <= nfb._linear_interpolation_formula(a, b, t) <= a @hypothesis.given(t=st.floats(allow_nan=False, allow_infinity=False, min_value=0, max_value=1), @@ -3408,17 +3513,17 @@ class TestLerp: min_value=-1e300, max_value=1e300), b=st.floats(allow_nan=False, allow_infinity=False, min_value=-1e300, max_value=1e300)) - def test_lerp_symmetric(self, t, a, b): + def test_linear_interpolation_formula_symmetric(self, t, a, b): # double subtraction is needed to remove the extra precision of t < 0.5 - left = np.lib.function_base._lerp(a, b, 1 - (1 - t)) - right = np.lib.function_base._lerp(b, a, 1 - t) + left = nfb._linear_interpolation_formula(a, b, 1 - (1 - t)) + right = nfb._linear_interpolation_formula(b, a, 1 - t) assert left == right - def test_lerp_0d_inputs(self): + def test_linear_interpolation_formula_0d_inputs(self): a = np.array(2) b = np.array(5) t = np.array(0.2) - assert np.lib.function_base._lerp(a, b, t) == 2.6 + assert nfb._linear_interpolation_formula(a, b, t) == 2.6 class TestMedian: |