1 files changed, 240 insertions, 109 deletions
diff --git a/Lib/heapq.py b/Lib/heapq.py
index d615239b94..07af37e717 100644
--- a/Lib/heapq.py
+++ b/Lib/heapq.py
@@ -127,8 +127,6 @@ From all times, sorting has always been a Great Art! :-)
 __all__ = ['heappush', 'heappop', 'heapify', 'heapreplace', 'merge',
            'nlargest', 'nsmallest', 'heappushpop']
 
-from itertools import islice, count, tee, chain
-
 def heappush(heap, item):
     """Push item onto heap, maintaining the heap invariant."""
     heap.append(item)
@@ -141,9 +139,8 @@ def heappop(heap):
         returnitem = heap[0]
         heap[0] = lastelt
         _siftup(heap, 0)
-    else:
-        returnitem = lastelt
-    return returnitem
+        return returnitem
+    return lastelt
 
 def heapreplace(heap, item):
     """Pop and return the current smallest value, and add the new item.
@@ -179,12 +176,22 @@ def heapify(x):
     for i in reversed(range(n//2)):
         _siftup(x, i)
 
-def _heappushpop_max(heap, item):
-    """Maxheap version of a heappush followed by a heappop."""
-    if heap and item < heap[0]:
-        item, heap[0] = heap[0], item
+def _heappop_max(heap):
+    """Maxheap version of a heappop."""
+    lastelt = heap.pop()    # raises appropriate IndexError if heap is empty
+    if heap:
+        returnitem = heap[0]
+        heap[0] = lastelt
         _siftup_max(heap, 0)
-    return item
+        return returnitem
+    return lastelt
+
+def _heapreplace_max(heap, item):
+    """Maxheap version of a heappop followed by a heappush."""
+    returnitem = heap[0]    # raises appropriate IndexError if heap is empty
+    heap[0] = item
+    _siftup_max(heap, 0)
+    return returnitem
 
 def _heapify_max(x):
     """Transform list into a maxheap, in-place, in O(len(x)) time."""
@@ -192,42 +199,6 @@ def _heapify_max(x):
     for i in reversed(range(n//2)):
         _siftup_max(x, i)
 
-def nlargest(n, iterable):
-    """Find the n largest elements in a dataset.
-
-    Equivalent to:  sorted(iterable, reverse=True)[:n]
-    """
-    if n < 0:
-        return []
-    it = iter(iterable)
-    result = list(islice(it, n))
-    if not result:
-        return result
-    heapify(result)
-    _heappushpop = heappushpop
-    for elem in it:
-        _heappushpop(result, elem)
-    result.sort(reverse=True)
-    return result
-
-def nsmallest(n, iterable):
-    """Find the n smallest elements in a dataset.
-
-    Equivalent to:  sorted(iterable)[:n]
-    """
-    if n < 0:
-        return []
-    it = iter(iterable)
-    result = list(islice(it, n))
-    if not result:
-        return result
-    _heapify_max(result)
-    _heappushpop = _heappushpop_max
-    for elem in it:
-        _heappushpop(result, elem)
-    result.sort()
-    return result
-
 # 'heap' is a heap at all indices >= startpos, except possibly for pos.  pos
 # is the index of a leaf with a possibly out-of-order value.  Restore the
 # heap invariant.
@@ -340,13 +311,7 @@ def _siftup_max(heap, pos):
     heap[pos] = newitem
     _siftdown_max(heap, startpos, pos)
 
-# If available, use C implementation
-try:
-    from _heapq import *
-except ImportError:
-    pass
-
-def merge(*iterables):
+def merge(*iterables, key=None, reverse=False):
     '''Merge multiple sorted inputs into a single sorted output.
 
     Similar to sorted(itertools.chain(*iterables)) but returns a generator,
@@ -356,51 +321,158 @@ def merge(*iterables):
     >>> list(merge([1,3,5,7], [0,2,4,8], [5,10,15,20], [], [25]))
     [0, 1, 2, 3, 4, 5, 5, 7, 8, 10, 15, 20, 25]
 
+    If *key* is not None, applies a key function to each element to determine
+    its sort order.
+
+    >>> list(merge(['dog', 'horse'], ['cat', 'fish', 'kangaroo'], key=len))
+    ['dog', 'cat', 'fish', 'horse', 'kangaroo']
+
     '''
-    _heappop, _heapreplace, _StopIteration = heappop, heapreplace, StopIteration
-    _len = len
 
     h = []
     h_append = h.append
-    for itnum, it in enumerate(map(iter, iterables)):
+
+    if reverse:
+        _heapify = _heapify_max
+        _heappop = _heappop_max
+        _heapreplace = _heapreplace_max
+        direction = -1
+    else:
+        _heapify = heapify
+        _heappop = heappop
+        _heapreplace = heapreplace
+        direction = 1
+
+    if key is None:
+        for order, it in enumerate(map(iter, iterables)):
+            try:
+                next = it.__next__
+                h_append([next(), order * direction, next])
+            except StopIteration:
+                pass
+        _heapify(h)
+        while len(h) > 1:
+            try:
+                while True:
+                    value, order, next = s = h[0]
+                    yield value
+                    s[0] = next()           # raises StopIteration when exhausted
+                    _heapreplace(h, s)      # restore heap condition
+            except StopIteration:
+                _heappop(h)                 # remove empty iterator
+        if h:
+            # fast case when only a single iterator remains
+            value, order, next = h[0]
+            yield value
+            yield from next.__self__
+        return
+
+    for order, it in enumerate(map(iter, iterables)):
         try:
             next = it.__next__
-            h_append([next(), itnum, next])
-        except _StopIteration:
+            value = next()
+            h_append([key(value), order * direction, value, next])
+        except StopIteration:
             pass
-    heapify(h)
-
-    while _len(h) > 1:
+    _heapify(h)
+    while len(h) > 1:
         try:
             while True:
-                v, itnum, next = s = h[0]
-                yield v
-                s[0] = next()               # raises StopIteration when exhausted
-                _heapreplace(h, s)          # restore heap condition
-        except _StopIteration:
-            _heappop(h)                     # remove empty iterator
+                key_value, order, value, next = s = h[0]
+                yield value
+                value = next()
+                s[0] = key(value)
+                s[2] = value
+                _heapreplace(h, s)
+        except StopIteration:
+            _heappop(h)
     if h:
-        # fast case when only a single iterator remains
-        v, itnum, next = h[0]
-        yield v
+        key_value, order, value, next = h[0]
+        yield value
         yield from next.__self__
 
-# Extend the implementations of nsmallest and nlargest to use a key= argument
-_nsmallest = nsmallest
+
+# Algorithm notes for nlargest() and nsmallest()
+# ==============================================
+#
+# Make a single pass over the data while keeping the k most extreme values
+# in a heap.  Memory consumption is limited to keeping k values in a list.
+#
+# Measured performance for random inputs:
+#
+#                                   number of comparisons
+#    n inputs     k-extreme values  (average of 5 trials)   % more than min()
+# -------------   ----------------  ---------------------   -----------------
+#      1,000           100                  3,317               231.7%
+#     10,000           100                 14,046                40.5%
+#    100,000           100                105,749                 5.7%
+#  1,000,000           100              1,007,751                 0.8%
+# 10,000,000           100             10,009,401                 0.1%
+#
+# Theoretical number of comparisons for k smallest of n random inputs:
+#
+# Step   Comparisons                  Action
+# ----   --------------------------   ---------------------------
+#  1     1.66 * k                     heapify the first k-inputs
+#  2     n - k                        compare remaining elements to top of heap
+#  3     k * (1 + lg2(k)) * ln(n/k)   replace the topmost value on the heap
+#  4     k * lg2(k) - (k/2)           final sort of the k most extreme values
+#
+# Combining and simplifying for a rough estimate gives:
+#
+#        comparisons = n + k * (log(k, 2) * log(n/k) + log(k, 2) + log(n/k))
+#
+# Computing the number of comparisons for step 3:
+# -----------------------------------------------
+# * For the i-th new value from the iterable, the probability of being in the
+#   k most extreme values is k/i.  For example, the probability of the 101st
+#   value seen being in the 100 most extreme values is 100/101.
+# * If the value is a new extreme value, the cost of inserting it into the
+#   heap is 1 + log(k, 2).
+# * The probability times the cost gives:
+#            (k/i) * (1 + log(k, 2))
+# * Summing across the remaining n-k elements gives:
+#            sum((k/i) * (1 + log(k, 2)) for i in range(k+1, n+1))
+# * This reduces to:
+#            (H(n) - H(k)) * k * (1 + log(k, 2))
+# * Where H(n) is the n-th harmonic number estimated by:
+#            gamma = 0.5772156649
+#            H(n) = log(n, e) + gamma + 1 / (2 * n)
+#   http://en.wikipedia.org/wiki/Harmonic_series_(mathematics)#Rate_of_divergence
+# * Substituting the H(n) formula:
+#            comparisons = k * (1 + log(k, 2)) * (log(n/k, e) + (1/n - 1/k) / 2)
+#
+# Worst-case for step 3:
+# ----------------------
+# In the worst case, the input data is reversed sorted so that every new element
+# must be inserted in the heap:
+#
+#             comparisons = 1.66 * k + log(k, 2) * (n - k)
+#
+# Alternative Algorithms
+# ----------------------
+# Other algorithms were not used because they:
+# 1) Took much more auxiliary memory,
+# 2) Made multiple passes over the data.
+# 3) Made more comparisons in common cases (small k, large n, semi-random input).
+# See the more detailed comparison of approach at:
+# http://code.activestate.com/recipes/577573-compare-algorithms-for-heapqsmallest
+
 def nsmallest(n, iterable, key=None):
     """Find the n smallest elements in a dataset.
 
     Equivalent to:  sorted(iterable, key=key)[:n]
     """
-    # Short-cut for n==1 is to use min() when len(iterable)>0
+
+    # Short-cut for n==1 is to use min()
     if n == 1:
         it = iter(iterable)
-        head = list(islice(it, 1))
-        if not head:
-            return []
+        sentinel = object()
         if key is None:
-            return [min(chain(head, it))]
-        return [min(chain(head, it), key=key)]
+            result = min(it, default=sentinel)
+        else:
+            result = min(it, default=sentinel, key=key)
+        return [] if result is sentinel else [result]
 
     # When n>=size, it's faster to use sorted()
     try:
@@ -413,32 +485,57 @@ def nsmallest(n, iterable, key=None):
 
     # When key is none, use simpler decoration
     if key is None:
-        it = zip(iterable, count())                         # decorate
-        result = _nsmallest(n, it)
-        return [r[0] for r in result]                       # undecorate
+        it = iter(iterable)
+        # put the range(n) first so that zip() doesn't
+        # consume one too many elements from the iterator
+        result = [(elem, i) for i, elem in zip(range(n), it)]
+        if not result:
+            return result
+        _heapify_max(result)
+        top = result[0][0]
+        order = n
+        _heapreplace = _heapreplace_max
+        for elem in it:
+            if elem < top:
+                _heapreplace(result, (elem, order))
+                top = result[0][0]
+                order += 1
+        result.sort()
+        return [r[0] for r in result]
 
     # General case, slowest method
-    in1, in2 = tee(iterable)
-    it = zip(map(key, in1), count(), in2)                   # decorate
-    result = _nsmallest(n, it)
-    return [r[2] for r in result]                           # undecorate
+    it = iter(iterable)
+    result = [(key(elem), i, elem) for i, elem in zip(range(n), it)]
+    if not result:
+        return result
+    _heapify_max(result)
+    top = result[0][0]
+    order = n
+    _heapreplace = _heapreplace_max
+    for elem in it:
+        k = key(elem)
+        if k < top:
+            _heapreplace(result, (k, order, elem))
+            top = result[0][0]
+            order += 1
+    result.sort()
+    return [r[2] for r in result]
 
-_nlargest = nlargest
 def nlargest(n, iterable, key=None):
     """Find the n largest elements in a dataset.
 
     Equivalent to:  sorted(iterable, key=key, reverse=True)[:n]
     """
 
-    # Short-cut for n==1 is to use max() when len(iterable)>0
+    # Short-cut for n==1 is to use max()
     if n == 1:
         it = iter(iterable)
-        head = list(islice(it, 1))
-        if not head:
-            return []
+        sentinel = object()
         if key is None:
-            return [max(chain(head, it))]
-        return [max(chain(head, it), key=key)]
+            result = max(it, default=sentinel)
+        else:
+            result = max(it, default=sentinel, key=key)
+        return [] if result is sentinel else [result]
 
     # When n>=size, it's faster to use sorted()
     try:
@@ -451,26 +548,60 @@ def nlargest(n, iterable, key=None):
 
     # When key is none, use simpler decoration
     if key is None:
-        it = zip(iterable, count(0,-1))                     # decorate
-        result = _nlargest(n, it)
-        return [r[0] for r in result]                       # undecorate
+        it = iter(iterable)
+        result = [(elem, i) for i, elem in zip(range(0, -n, -1), it)]
+        if not result:
+            return result
+        heapify(result)
+        top = result[0][0]
+        order = -n
+        _heapreplace = heapreplace
+        for elem in it:
+            if top < elem:
+                _heapreplace(result, (elem, order))
+                top = result[0][0]
+                order -= 1
+        result.sort(reverse=True)
+        return [r[0] for r in result]
 
     # General case, slowest method
-    in1, in2 = tee(iterable)
-    it = zip(map(key, in1), count(0,-1), in2)               # decorate
-    result = _nlargest(n, it)
-    return [r[2] for r in result]                           # undecorate
+    it = iter(iterable)
+    result = [(key(elem), i, elem) for i, elem in zip(range(0, -n, -1), it)]
+    if not result:
+        return result
+    heapify(result)
+    top = result[0][0]
+    order = -n
+    _heapreplace = heapreplace
+    for elem in it:
+        k = key(elem)
+        if top < k:
+            _heapreplace(result, (k, order, elem))
+            top = result[0][0]
+            order -= 1
+    result.sort(reverse=True)
+    return [r[2] for r in result]
+
+# If available, use C implementation
+try:
+    from _heapq import *
+except ImportError:
+    pass
+try:
+    from _heapq import _heapreplace_max
+except ImportError:
+    pass
+try:
+    from _heapq import _heapify_max
+except ImportError:
+    pass
+try:
+    from _heapq import _heappop_max
+except ImportError:
+    pass
+
 
 if __name__ == "__main__":
-    # Simple sanity test
-    heap = []
-    data = [1, 3, 5, 7, 9, 2, 4, 6, 8, 0]
-    for item in data:
-        heappush(heap, item)
-    sort = []
-    while heap:
-        sort.append(heappop(heap))
-    print(sort)
 
     import doctest
-    doctest.testmod()
+    print(doctest.testmod())