summaryrefslogtreecommitdiff
path: root/Lib/statistics.py
diff options
context:
space:
mode:
Diffstat (limited to 'Lib/statistics.py')
-rw-r--r--Lib/statistics.py151
1 files changed, 92 insertions, 59 deletions
diff --git a/Lib/statistics.py b/Lib/statistics.py
index 4f5c1c164a..30fe55c86a 100644
--- a/Lib/statistics.py
+++ b/Lib/statistics.py
@@ -1,20 +1,3 @@
-## Module statistics.py
-##
-## Copyright (c) 2013 Steven D'Aprano <steve+python@pearwood.info>.
-##
-## Licensed under the Apache License, Version 2.0 (the "License");
-## you may not use this file except in compliance with the License.
-## You may obtain a copy of the License at
-##
-## http://www.apache.org/licenses/LICENSE-2.0
-##
-## Unless required by applicable law or agreed to in writing, software
-## distributed under the License is distributed on an "AS IS" BASIS,
-## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-## See the License for the specific language governing permissions and
-## limitations under the License.
-
-
"""
Basic statistics module.
@@ -28,6 +11,7 @@ Calculating averages
Function Description
================== =============================================
mean Arithmetic mean (average) of data.
+harmonic_mean Harmonic mean of data.
median Median (middle value) of data.
median_low Low median of data.
median_high High median of data.
@@ -95,16 +79,18 @@ A single exception is defined: StatisticsError is a subclass of ValueError.
__all__ = [ 'StatisticsError',
'pstdev', 'pvariance', 'stdev', 'variance',
'median', 'median_low', 'median_high', 'median_grouped',
- 'mean', 'mode',
+ 'mean', 'mode', 'harmonic_mean',
]
-
import collections
+import decimal
import math
+import numbers
from fractions import Fraction
from decimal import Decimal
-from itertools import groupby
+from itertools import groupby, chain
+from bisect import bisect_left, bisect_right
@@ -134,7 +120,8 @@ def _sum(data, start=0):
Some sources of round-off error will be avoided:
- >>> _sum([1e50, 1, -1e50] * 1000) # Built-in sum returns zero.
+ # Built-in sum returns zero.
+ >>> _sum([1e50, 1, -1e50] * 1000)
(<class 'float'>, Fraction(1000, 1), 3000)
Fractions and Decimals are also supported:
@@ -223,56 +210,26 @@ def _exact_ratio(x):
# Optimise the common case of floats. We expect that the most often
# used numeric type will be builtin floats, so try to make this as
# fast as possible.
- if type(x) is float:
+ if type(x) is float or type(x) is Decimal:
return x.as_integer_ratio()
try:
# x may be an int, Fraction, or Integral ABC.
return (x.numerator, x.denominator)
except AttributeError:
try:
- # x may be a float subclass.
+ # x may be a float or Decimal subclass.
return x.as_integer_ratio()
except AttributeError:
- try:
- # x may be a Decimal.
- return _decimal_to_ratio(x)
- except AttributeError:
- # Just give up?
- pass
+ # Just give up?
+ pass
except (OverflowError, ValueError):
# float NAN or INF.
- assert not math.isfinite(x)
+ assert not _isfinite(x)
return (x, None)
msg = "can't convert type '{}' to numerator/denominator"
raise TypeError(msg.format(type(x).__name__))
-# FIXME This is faster than Fraction.from_decimal, but still too slow.
-def _decimal_to_ratio(d):
- """Convert Decimal d to exact integer ratio (numerator, denominator).
-
- >>> from decimal import Decimal
- >>> _decimal_to_ratio(Decimal("2.6"))
- (26, 10)
-
- """
- sign, digits, exp = d.as_tuple()
- if exp in ('F', 'n', 'N'): # INF, NAN, sNAN
- assert not d.is_finite()
- return (d, None)
- num = 0
- for digit in digits:
- num = num*10 + digit
- if exp < 0:
- den = 10**-exp
- else:
- num *= 10**exp
- den = 1
- if sign:
- num = -num
- return (num, den)
-
-
def _convert(value, T):
"""Convert value to given numeric type T."""
if type(value) is T:
@@ -305,6 +262,30 @@ def _counts(data):
return table
+def _find_lteq(a, x):
+ 'Locate the leftmost value exactly equal to x'
+ i = bisect_left(a, x)
+ if i != len(a) and a[i] == x:
+ return i
+ raise ValueError
+
+
+def _find_rteq(a, l, x):
+ 'Locate the rightmost value exactly equal to x'
+ i = bisect_right(a, x, lo=l)
+ if i != (len(a)+1) and a[i-1] == x:
+ return i-1
+ raise ValueError
+
+
+def _fail_neg(values, errmsg='negative value'):
+ """Iterate over values, failing if any are less than zero."""
+ for x in values:
+ if x < 0:
+ raise StatisticsError(errmsg)
+ yield x
+
+
# === Measures of central tendency (averages) ===
def mean(data):
@@ -333,6 +314,52 @@ def mean(data):
return _convert(total/n, T)
+def harmonic_mean(data):
+ """Return the harmonic mean of data.
+
+ The harmonic mean, sometimes called the subcontrary mean, is the
+ reciprocal of the arithmetic mean of the reciprocals of the data,
+ and is often appropriate when averaging quantities which are rates
+ or ratios, for example speeds. Example:
+
+ Suppose an investor purchases an equal value of shares in each of
+ three companies, with P/E (price/earning) ratios of 2.5, 3 and 10.
+ What is the average P/E ratio for the investor's portfolio?
+
+ >>> harmonic_mean([2.5, 3, 10]) # For an equal investment portfolio.
+ 3.6
+
+ Using the arithmetic mean would give an average of about 5.167, which
+ is too high.
+
+ If ``data`` is empty, or any element is less than zero,
+ ``harmonic_mean`` will raise ``StatisticsError``.
+ """
+ # For a justification for using harmonic mean for P/E ratios, see
+ # http://fixthepitch.pellucid.com/comps-analysis-the-missing-harmony-of-summary-statistics/
+ # http://papers.ssrn.com/sol3/papers.cfm?abstract_id=2621087
+ if iter(data) is data:
+ data = list(data)
+ errmsg = 'harmonic mean does not support negative values'
+ n = len(data)
+ if n < 1:
+ raise StatisticsError('harmonic_mean requires at least one data point')
+ elif n == 1:
+ x = data[0]
+ if isinstance(x, (numbers.Real, Decimal)):
+ if x < 0:
+ raise StatisticsError(errmsg)
+ return x
+ else:
+ raise TypeError('unsupported type')
+ try:
+ T, total, count = _sum(1/x for x in _fail_neg(data, errmsg))
+ except ZeroDivisionError:
+ return 0
+ assert count == n
+ return _convert(n/total, T)
+
+
# FIXME: investigate ways to calculate medians without sorting? Quickselect?
def median(data):
"""Return the median (middle value) of numeric data.
@@ -442,9 +469,15 @@ def median_grouped(data, interval=1):
except TypeError:
# Mixed type. For now we just coerce to float.
L = float(x) - float(interval)/2
- cf = data.index(x) # Number of values below the median interval.
- # FIXME The following line could be more efficient for big lists.
- f = data.count(x) # Number of data points in the median interval.
+
+ # Uses bisection search to search for x in data with log(n) time complexity
+ # Find the position of leftmost occurrence of x in data
+ l1 = _find_lteq(data, x)
+ # Find the position of rightmost occurrence of x in data[l1...len(data)]
+ # Assuming always l1 <= l2
+ l2 = _find_rteq(data, l1, x)
+ cf = l1
+ f = l2 - l1 + 1
return L + interval*(n/2 - cf)/f