summaryrefslogtreecommitdiff
path: root/Lib/heapq.py
diff options
context:
space:
mode:
Diffstat (limited to 'Lib/heapq.py')
-rw-r--r--Lib/heapq.py349
1 files changed, 240 insertions, 109 deletions
diff --git a/Lib/heapq.py b/Lib/heapq.py
index d615239b94..07af37e717 100644
--- a/Lib/heapq.py
+++ b/Lib/heapq.py
@@ -127,8 +127,6 @@ From all times, sorting has always been a Great Art! :-)
__all__ = ['heappush', 'heappop', 'heapify', 'heapreplace', 'merge',
'nlargest', 'nsmallest', 'heappushpop']
-from itertools import islice, count, tee, chain
-
def heappush(heap, item):
"""Push item onto heap, maintaining the heap invariant."""
heap.append(item)
@@ -141,9 +139,8 @@ def heappop(heap):
returnitem = heap[0]
heap[0] = lastelt
_siftup(heap, 0)
- else:
- returnitem = lastelt
- return returnitem
+ return returnitem
+ return lastelt
def heapreplace(heap, item):
"""Pop and return the current smallest value, and add the new item.
@@ -179,12 +176,22 @@ def heapify(x):
for i in reversed(range(n//2)):
_siftup(x, i)
-def _heappushpop_max(heap, item):
- """Maxheap version of a heappush followed by a heappop."""
- if heap and item < heap[0]:
- item, heap[0] = heap[0], item
+def _heappop_max(heap):
+ """Maxheap version of a heappop."""
+ lastelt = heap.pop() # raises appropriate IndexError if heap is empty
+ if heap:
+ returnitem = heap[0]
+ heap[0] = lastelt
_siftup_max(heap, 0)
- return item
+ return returnitem
+ return lastelt
+
+def _heapreplace_max(heap, item):
+ """Maxheap version of a heappop followed by a heappush."""
+ returnitem = heap[0] # raises appropriate IndexError if heap is empty
+ heap[0] = item
+ _siftup_max(heap, 0)
+ return returnitem
def _heapify_max(x):
"""Transform list into a maxheap, in-place, in O(len(x)) time."""
@@ -192,42 +199,6 @@ def _heapify_max(x):
for i in reversed(range(n//2)):
_siftup_max(x, i)
-def nlargest(n, iterable):
- """Find the n largest elements in a dataset.
-
- Equivalent to: sorted(iterable, reverse=True)[:n]
- """
- if n < 0:
- return []
- it = iter(iterable)
- result = list(islice(it, n))
- if not result:
- return result
- heapify(result)
- _heappushpop = heappushpop
- for elem in it:
- _heappushpop(result, elem)
- result.sort(reverse=True)
- return result
-
-def nsmallest(n, iterable):
- """Find the n smallest elements in a dataset.
-
- Equivalent to: sorted(iterable)[:n]
- """
- if n < 0:
- return []
- it = iter(iterable)
- result = list(islice(it, n))
- if not result:
- return result
- _heapify_max(result)
- _heappushpop = _heappushpop_max
- for elem in it:
- _heappushpop(result, elem)
- result.sort()
- return result
-
# 'heap' is a heap at all indices >= startpos, except possibly for pos. pos
# is the index of a leaf with a possibly out-of-order value. Restore the
# heap invariant.
@@ -340,13 +311,7 @@ def _siftup_max(heap, pos):
heap[pos] = newitem
_siftdown_max(heap, startpos, pos)
-# If available, use C implementation
-try:
- from _heapq import *
-except ImportError:
- pass
-
-def merge(*iterables):
+def merge(*iterables, key=None, reverse=False):
'''Merge multiple sorted inputs into a single sorted output.
Similar to sorted(itertools.chain(*iterables)) but returns a generator,
@@ -356,51 +321,158 @@ def merge(*iterables):
>>> list(merge([1,3,5,7], [0,2,4,8], [5,10,15,20], [], [25]))
[0, 1, 2, 3, 4, 5, 5, 7, 8, 10, 15, 20, 25]
+ If *key* is not None, applies a key function to each element to determine
+ its sort order.
+
+ >>> list(merge(['dog', 'horse'], ['cat', 'fish', 'kangaroo'], key=len))
+ ['dog', 'cat', 'fish', 'horse', 'kangaroo']
+
'''
- _heappop, _heapreplace, _StopIteration = heappop, heapreplace, StopIteration
- _len = len
h = []
h_append = h.append
- for itnum, it in enumerate(map(iter, iterables)):
+
+ if reverse:
+ _heapify = _heapify_max
+ _heappop = _heappop_max
+ _heapreplace = _heapreplace_max
+ direction = -1
+ else:
+ _heapify = heapify
+ _heappop = heappop
+ _heapreplace = heapreplace
+ direction = 1
+
+ if key is None:
+ for order, it in enumerate(map(iter, iterables)):
+ try:
+ next = it.__next__
+ h_append([next(), order * direction, next])
+ except StopIteration:
+ pass
+ _heapify(h)
+ while len(h) > 1:
+ try:
+ while True:
+ value, order, next = s = h[0]
+ yield value
+ s[0] = next() # raises StopIteration when exhausted
+ _heapreplace(h, s) # restore heap condition
+ except StopIteration:
+ _heappop(h) # remove empty iterator
+ if h:
+ # fast case when only a single iterator remains
+ value, order, next = h[0]
+ yield value
+ yield from next.__self__
+ return
+
+ for order, it in enumerate(map(iter, iterables)):
try:
next = it.__next__
- h_append([next(), itnum, next])
- except _StopIteration:
+ value = next()
+ h_append([key(value), order * direction, value, next])
+ except StopIteration:
pass
- heapify(h)
-
- while _len(h) > 1:
+ _heapify(h)
+ while len(h) > 1:
try:
while True:
- v, itnum, next = s = h[0]
- yield v
- s[0] = next() # raises StopIteration when exhausted
- _heapreplace(h, s) # restore heap condition
- except _StopIteration:
- _heappop(h) # remove empty iterator
+ key_value, order, value, next = s = h[0]
+ yield value
+ value = next()
+ s[0] = key(value)
+ s[2] = value
+ _heapreplace(h, s)
+ except StopIteration:
+ _heappop(h)
if h:
- # fast case when only a single iterator remains
- v, itnum, next = h[0]
- yield v
+ key_value, order, value, next = h[0]
+ yield value
yield from next.__self__
-# Extend the implementations of nsmallest and nlargest to use a key= argument
-_nsmallest = nsmallest
+
+# Algorithm notes for nlargest() and nsmallest()
+# ==============================================
+#
+# Make a single pass over the data while keeping the k most extreme values
+# in a heap. Memory consumption is limited to keeping k values in a list.
+#
+# Measured performance for random inputs:
+#
+# number of comparisons
+# n inputs k-extreme values (average of 5 trials) % more than min()
+# ------------- ---------------- --------------------- -----------------
+# 1,000 100 3,317 231.7%
+# 10,000 100 14,046 40.5%
+# 100,000 100 105,749 5.7%
+# 1,000,000 100 1,007,751 0.8%
+# 10,000,000 100 10,009,401 0.1%
+#
+# Theoretical number of comparisons for k smallest of n random inputs:
+#
+# Step Comparisons Action
+# ---- -------------------------- ---------------------------
+# 1 1.66 * k heapify the first k-inputs
+# 2 n - k compare remaining elements to top of heap
+# 3 k * (1 + lg2(k)) * ln(n/k) replace the topmost value on the heap
+# 4 k * lg2(k) - (k/2) final sort of the k most extreme values
+#
+# Combining and simplifying for a rough estimate gives:
+#
+# comparisons = n + k * (log(k, 2) * log(n/k) + log(k, 2) + log(n/k))
+#
+# Computing the number of comparisons for step 3:
+# -----------------------------------------------
+# * For the i-th new value from the iterable, the probability of being in the
+# k most extreme values is k/i. For example, the probability of the 101st
+# value seen being in the 100 most extreme values is 100/101.
+# * If the value is a new extreme value, the cost of inserting it into the
+# heap is 1 + log(k, 2).
+# * The probability times the cost gives:
+# (k/i) * (1 + log(k, 2))
+# * Summing across the remaining n-k elements gives:
+# sum((k/i) * (1 + log(k, 2)) for i in range(k+1, n+1))
+# * This reduces to:
+# (H(n) - H(k)) * k * (1 + log(k, 2))
+# * Where H(n) is the n-th harmonic number estimated by:
+# gamma = 0.5772156649
+# H(n) = log(n, e) + gamma + 1 / (2 * n)
+# http://en.wikipedia.org/wiki/Harmonic_series_(mathematics)#Rate_of_divergence
+# * Substituting the H(n) formula:
+# comparisons = k * (1 + log(k, 2)) * (log(n/k, e) + (1/n - 1/k) / 2)
+#
+# Worst-case for step 3:
+# ----------------------
+# In the worst case, the input data is reversed sorted so that every new element
+# must be inserted in the heap:
+#
+# comparisons = 1.66 * k + log(k, 2) * (n - k)
+#
+# Alternative Algorithms
+# ----------------------
+# Other algorithms were not used because they:
+# 1) Took much more auxiliary memory,
+# 2) Made multiple passes over the data.
+# 3) Made more comparisons in common cases (small k, large n, semi-random input).
+# See the more detailed comparison of approach at:
+# http://code.activestate.com/recipes/577573-compare-algorithms-for-heapqsmallest
+
def nsmallest(n, iterable, key=None):
"""Find the n smallest elements in a dataset.
Equivalent to: sorted(iterable, key=key)[:n]
"""
- # Short-cut for n==1 is to use min() when len(iterable)>0
+
+ # Short-cut for n==1 is to use min()
if n == 1:
it = iter(iterable)
- head = list(islice(it, 1))
- if not head:
- return []
+ sentinel = object()
if key is None:
- return [min(chain(head, it))]
- return [min(chain(head, it), key=key)]
+ result = min(it, default=sentinel)
+ else:
+ result = min(it, default=sentinel, key=key)
+ return [] if result is sentinel else [result]
# When n>=size, it's faster to use sorted()
try:
@@ -413,32 +485,57 @@ def nsmallest(n, iterable, key=None):
# When key is none, use simpler decoration
if key is None:
- it = zip(iterable, count()) # decorate
- result = _nsmallest(n, it)
- return [r[0] for r in result] # undecorate
+ it = iter(iterable)
+ # put the range(n) first so that zip() doesn't
+ # consume one too many elements from the iterator
+ result = [(elem, i) for i, elem in zip(range(n), it)]
+ if not result:
+ return result
+ _heapify_max(result)
+ top = result[0][0]
+ order = n
+ _heapreplace = _heapreplace_max
+ for elem in it:
+ if elem < top:
+ _heapreplace(result, (elem, order))
+ top = result[0][0]
+ order += 1
+ result.sort()
+ return [r[0] for r in result]
# General case, slowest method
- in1, in2 = tee(iterable)
- it = zip(map(key, in1), count(), in2) # decorate
- result = _nsmallest(n, it)
- return [r[2] for r in result] # undecorate
+ it = iter(iterable)
+ result = [(key(elem), i, elem) for i, elem in zip(range(n), it)]
+ if not result:
+ return result
+ _heapify_max(result)
+ top = result[0][0]
+ order = n
+ _heapreplace = _heapreplace_max
+ for elem in it:
+ k = key(elem)
+ if k < top:
+ _heapreplace(result, (k, order, elem))
+ top = result[0][0]
+ order += 1
+ result.sort()
+ return [r[2] for r in result]
-_nlargest = nlargest
def nlargest(n, iterable, key=None):
"""Find the n largest elements in a dataset.
Equivalent to: sorted(iterable, key=key, reverse=True)[:n]
"""
- # Short-cut for n==1 is to use max() when len(iterable)>0
+ # Short-cut for n==1 is to use max()
if n == 1:
it = iter(iterable)
- head = list(islice(it, 1))
- if not head:
- return []
+ sentinel = object()
if key is None:
- return [max(chain(head, it))]
- return [max(chain(head, it), key=key)]
+ result = max(it, default=sentinel)
+ else:
+ result = max(it, default=sentinel, key=key)
+ return [] if result is sentinel else [result]
# When n>=size, it's faster to use sorted()
try:
@@ -451,26 +548,60 @@ def nlargest(n, iterable, key=None):
# When key is none, use simpler decoration
if key is None:
- it = zip(iterable, count(0,-1)) # decorate
- result = _nlargest(n, it)
- return [r[0] for r in result] # undecorate
+ it = iter(iterable)
+ result = [(elem, i) for i, elem in zip(range(0, -n, -1), it)]
+ if not result:
+ return result
+ heapify(result)
+ top = result[0][0]
+ order = -n
+ _heapreplace = heapreplace
+ for elem in it:
+ if top < elem:
+ _heapreplace(result, (elem, order))
+ top = result[0][0]
+ order -= 1
+ result.sort(reverse=True)
+ return [r[0] for r in result]
# General case, slowest method
- in1, in2 = tee(iterable)
- it = zip(map(key, in1), count(0,-1), in2) # decorate
- result = _nlargest(n, it)
- return [r[2] for r in result] # undecorate
+ it = iter(iterable)
+ result = [(key(elem), i, elem) for i, elem in zip(range(0, -n, -1), it)]
+ if not result:
+ return result
+ heapify(result)
+ top = result[0][0]
+ order = -n
+ _heapreplace = heapreplace
+ for elem in it:
+ k = key(elem)
+ if top < k:
+ _heapreplace(result, (k, order, elem))
+ top = result[0][0]
+ order -= 1
+ result.sort(reverse=True)
+ return [r[2] for r in result]
+
+# If available, use C implementation
+try:
+ from _heapq import *
+except ImportError:
+ pass
+try:
+ from _heapq import _heapreplace_max
+except ImportError:
+ pass
+try:
+ from _heapq import _heapify_max
+except ImportError:
+ pass
+try:
+ from _heapq import _heappop_max
+except ImportError:
+ pass
+
if __name__ == "__main__":
- # Simple sanity test
- heap = []
- data = [1, 3, 5, 7, 9, 2, 4, 6, 8, 0]
- for item in data:
- heappush(heap, item)
- sort = []
- while heap:
- sort.append(heappop(heap))
- print(sort)
import doctest
- doctest.testmod()
+ print(doctest.testmod())