-rw-r--r--  rts/sm/BlockAlloc.c  | 267
-rw-r--r--  rts/sm/BlockAlloc.h  |   3
-rw-r--r--  rts/sm/GC.c          |  10
3 files changed, 245 insertions(+), 35 deletions(-)
diff --git a/rts/sm/BlockAlloc.c b/rts/sm/BlockAlloc.c
index 180a06f33e..d5651db6d8 100644
--- a/rts/sm/BlockAlloc.c
+++ b/rts/sm/BlockAlloc.c
@@ -161,11 +161,23 @@ static void initMBlock(void *mblock, uint32_t node);
static bdescr *free_list[MAX_NUMA_NODES][NUM_FREE_LISTS];
static bdescr *free_mblock_list[MAX_NUMA_NODES];
+// To avoid quadratic runtime when freeing a large number of mblocks during a
+// single GC run, frees can be deferred to a separate free list that foregoes
+// sorting and coalescing. As the final step of a GC run we then sort the
+// deferred list separately and merge it into the actual free list in one go.
+static bool defer_mblock_frees;
+static bdescr *deferred_free_mblock_list[MAX_NUMA_NODES];
+
W_ n_alloc_blocks; // currently allocated blocks
W_ hw_alloc_blocks; // high-water allocated blocks
W_ n_alloc_blocks_by_node[MAX_NUMA_NODES];
+
+static bdescr* splitDeferredList(bdescr* head);
+static void sortDeferredList(bdescr** head);
+
/* -----------------------------------------------------------------------------
Initialisation
-------------------------------------------------------------------------- */
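As a rough cost sketch of the problem being addressed: freeing n mblocks one at a time into an address-sorted free list means each free walks the list from its head, costing on the order of 1 + 2 + ... + n = n(n+1)/2 link traversals in the worst case, i.e. O(n^2). Deferring the frees, sorting the batch once in O(n log n), and then merging it into an existing free list of length m in a single O(n + m) pass removes the quadratic term.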
@@ -371,41 +383,79 @@ split_block_low (bdescr *bd, W_ n)
return bd;
}
-/* Only initializes the start pointers on the first megablock and the
- * blocks field of the first bdescr; callers are responsible for calling
- * initGroup afterwards.
+
+/* Find a fitting block for the allocation request in the given free list.
+ Returns:
+ - non-NULL: an exact match was found and has been unlinked from the free list.
+ - NULL: no exact match was found. In this case, the out parameter
+ `best` can be inspected to get the best-fitting larger block, if any.
*/
static bdescr *
-alloc_mega_group (uint32_t node, StgWord mblocks)
+alloc_mega_group_from_free_list (bdescr** free_list_head, StgWord n, bdescr** best)
{
- bdescr *best, *bd, *prev;
- StgWord n;
-
- n = MBLOCK_GROUP_BLOCKS(mblocks);
-
- best = NULL;
+ bdescr *bd, *prev;
+ *best = NULL;
prev = NULL;
- for (bd = free_mblock_list[node]; bd != NULL; prev = bd, bd = bd->link)
+ for (bd = *free_list_head; bd != NULL; prev = bd, bd = bd->link)
{
if (bd->blocks == n)
{
if (prev) {
prev->link = bd->link;
} else {
- free_mblock_list[node] = bd->link;
+ *free_list_head = bd->link;
}
return bd;
}
else if (bd->blocks > n)
{
- if (!best || bd->blocks < best->blocks)
+ if (!*best || bd->blocks < (*best)->blocks)
{
- best = bd;
+ *best = bd;
}
}
}
+ return NULL;
+}
- if (best)
+/* Only initializes the start pointers on the first megablock and the
+ * blocks field of the first bdescr; callers are responsible for calling
+ * initGroup afterwards.
+ */
+static bdescr *
+alloc_mega_group (uint32_t node, StgWord mblocks)
+{
+ bdescr *best, *bd;
+ StgWord n;
+
+ n = MBLOCK_GROUP_BLOCKS(mblocks);
+
+ if(defer_mblock_frees)
+ {
+ // Freeing mega groups is currently deferred. Try to serve new requests
+ // preferentially from our deferred free list.
+ bd = alloc_mega_group_from_free_list(&deferred_free_mblock_list[node], n, &best);
+ if(bd)
+ {
+ return bd;
+ }
+ else if(!best)
+ {
+ // If there was neither an exact nor a best match, try the regular free list.
+ bd = alloc_mega_group_from_free_list(&free_mblock_list[node], n, &best);
+ }
+ }
+ else
+ {
+ // Otherwise, always use the regular free list.
+ bd = alloc_mega_group_from_free_list(&free_mblock_list[node], n, &best);
+ }
+
+ if (bd)
+ {
+ return bd;
+ }
+ else if (best)
{
// we take our chunk off the end here.
StgWord best_mblocks = BLOCKS_TO_MBLOCKS(best->blocks);
@@ -758,30 +808,90 @@ free_mega_group (bdescr *mg)
bdescr *bd, *prev;
uint32_t node;
- // Find the right place in the free list. free_mblock_list is
- // sorted by *address*, not by size as the free_list is.
- prev = NULL;
node = mg->node;
- bd = free_mblock_list[node];
- while (bd && bd->start < mg->start) {
- prev = bd;
- bd = bd->link;
- }
- // coalesce backwards
- if (prev)
- {
- mg->link = prev->link;
- prev->link = mg;
- mg = coalesce_mblocks(prev);
+ if(defer_mblock_frees) {
+ // Put the block on the deferred free list without coalescing.
+ mg->link = deferred_free_mblock_list[node];
+ deferred_free_mblock_list[node] = mg;
+ } else {
+ // Find the right place in the free list. free_mblock_list is
+ // sorted by *address*, not by size as the free_list is.
+ prev = NULL;
+ bd = free_mblock_list[node];
+ while (bd && bd->start < mg->start) {
+ prev = bd;
+ bd = bd->link;
+ }
+
+ // coalesce backwards
+ if (prev)
+ {
+ mg->link = prev->link;
+ prev->link = mg;
+ mg = coalesce_mblocks(prev);
+ }
+ else
+ {
+ mg->link = free_mblock_list[node];
+ free_mblock_list[node] = mg;
+ }
+ // coalesce forwards
+ coalesce_mblocks(mg);
+
+ IF_DEBUG(sanity, checkFreeListSanity());
}
- else
- {
- mg->link = free_mblock_list[node];
- free_mblock_list[node] = mg;
+}
+
+static void
+free_deferred_mega_groups (uint32_t node)
+{
+ bdescr *mg, *bd, *prev, *new_head;
+
+ sortDeferredList(&deferred_free_mblock_list[node]);
+
+ new_head = deferred_free_mblock_list[node];
+ deferred_free_mblock_list[node] = NULL;
+
+ // Keep track of the insertion position in the free list
+ prev = NULL;
+ bd = free_mblock_list[node];
+
+ while(new_head != NULL) {
+ // Obtain mblock to free
+ mg = new_head;
+ new_head = new_head->link;
+
+ // Find the right place in the free list. This is similar to the process
+ // in `free_mega_group`, but we can exploit that the deferred list is
+ // sorted: the search starts out where the previous mblock was inserted.
+ // This means we only need to traverse the free list once to free all
+ // the mblocks, rather than once per mblock.
+ while (bd && bd->start < mg->start) {
+ prev = bd;
+ bd = bd->link;
+ }
+
+ // coalesce backwards
+ if (prev)
+ {
+ mg->link = prev->link;
+ prev->link = mg;
+ mg = coalesce_mblocks(prev);
+ }
+ else
+ {
+ mg->link = free_mblock_list[node];
+ free_mblock_list[node] = mg;
+ }
+
+ // coalesce forwards
+ coalesce_mblocks(mg);
+
+ // initialize search for next round
+ prev = mg;
+ bd = prev->link;
}
- // coalesce forwards
- coalesce_mblocks(mg);
IF_DEBUG(sanity, checkFreeListSanity());
}
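The single-pass merge performed by free_deferred_mega_groups can be illustrated with a minimal standalone sketch. The toy `node` type and the `merge_sorted_batch` helper below are hypothetical names for illustration only, not the RTS code, and coalescing is omitted:

    #include <stdio.h>
    #include <stdint.h>

    struct node { uintptr_t start; struct node *link; };

    /* Merge an address-sorted batch into an address-sorted destination list in
     * one pass, resuming each search from the previous insertion point. */
    static void merge_sorted_batch(struct node **dest, struct node *batch) {
        struct node *prev = NULL;   /* last destination node before the search point */
        struct node *bd = *dest;    /* current search position in the destination */

        while (batch != NULL) {
            struct node *mg = batch;        /* next node to insert */
            batch = batch->link;

            /* Both lists are sorted, so resume where the last insert left off. */
            while (bd && bd->start < mg->start) {
                prev = bd;
                bd = bd->link;
            }

            if (prev) {                     /* splice in after prev */
                mg->link = prev->link;
                prev->link = mg;
            } else {                        /* new head of the destination list */
                mg->link = *dest;
                *dest = mg;
            }

            prev = mg;                      /* next search starts after this node */
            bd = mg->link;
        }
    }

    int main(void) {
        struct node d[3] = { {10, 0}, {30, 0}, {50, 0} };
        struct node b[2] = { {20, 0}, {40, 0} };
        d[0].link = &d[1]; d[1].link = &d[2];
        b[0].link = &b[1];
        struct node *dest = &d[0];
        merge_sorted_batch(&dest, &b[0]);
        for (struct node *p = dest; p; p = p->link)
            printf("%lu ", (unsigned long)p->start);
        printf("\n");                       /* expected output: 10 20 30 40 50 */
        return 0;
    }

Because both lists are sorted by address, the search pointer only ever moves forward, so the whole batch is merged in one traversal of the destination list.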
@@ -949,6 +1059,93 @@ initMBlock(void *mblock, uint32_t node)
}
}
+// Split the linked list in half: terminate the first half at its midpoint and
+// return a pointer to the second half.
+static bdescr* splitDeferredList(bdescr* head) {
+ bdescr *fast, *slow, *second_half;
+
+ slow = head;
+ fast = slow->link;
+
+ while(fast != NULL) {
+ fast = fast->link;
+ if(fast) {
+ fast = fast->link;
+ slow = slow->link;
+ }
+ }
+
+ second_half = slow->link;
+ // Cap first list at half
+ slow->link = NULL;
+ return second_half;
+}
+
+// Merge sort the deferred free list in place, by ascending start address.
+static void sortDeferredList(bdescr** head) {
+ bdescr *first_half, *second_half, *cur;
+
+ if(*head == NULL || (*head)->link == NULL) {
+ // 0 or 1 elements, done
+ return;
+ }
+
+ first_half = *head;
+ second_half = splitDeferredList(*head);
+
+ sortDeferredList(&first_half);
+ sortDeferredList(&second_half);
+
+ // Merge the two sorted halves: the lower start address becomes the new head
+ if(first_half->start < second_half->start) {
+ *head = first_half;
+ first_half = first_half->link;
+ } else {
+ *head = second_half;
+ second_half = second_half->link;
+ }
+ cur = *head;
+
+ while(first_half != NULL && second_half != NULL) {
+ if(first_half->start < second_half->start) {
+ cur->link = first_half;
+ first_half = first_half->link;
+ } else {
+ cur->link = second_half;
+ second_half = second_half->link;
+ }
+ cur = cur->link;
+ }
+
+ // At most one of the halves is now non-empty; append its remainder as-is
+ while(first_half != NULL) {
+ cur->link = first_half;
+ first_half = first_half->link;
+ cur = cur->link;
+ }
+ while(second_half != NULL) {
+ cur->link = second_half;
+ second_half = second_half->link;
+ cur = cur->link;
+ }
+}
+
+void deferMBlockFreeing(void) {
+ if(defer_mblock_frees) {
+ barf("MBlock freeing is already deferred");
+ }
+ defer_mblock_frees = true;
+}
+
+void commitMBlockFreeing(void) {
+ if(! defer_mblock_frees) {
+ barf("MBlock freeing was never deferred");
+ }
+ defer_mblock_frees = false;
+
+ for(uint32_t node = 0; node < n_numa_nodes; node++) {
+ free_deferred_mega_groups(node);
+ }
+}
+
/* -----------------------------------------------------------------------------
Stats / metrics
-------------------------------------------------------------------------- */
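For reference, splitDeferredList and sortDeferredList together are ordinary top-down merge sort on a singly linked list, with the fast/slow pointer walk finding the midpoint in one pass. A compilable toy version follows; the `split_half` and `sort_list` names, the `node` type, and the stack-allocated dummy head used to simplify the merge are illustrative choices, not the RTS code:

    #include <stdio.h>
    #include <stdint.h>

    struct node { uintptr_t start; struct node *link; };

    /* Fast/slow pointer walk: terminate the first half at its midpoint and
     * return the second half. */
    static struct node *split_half(struct node *head) {
        struct node *slow = head, *fast = head->link;
        while (fast) {
            fast = fast->link;
            if (fast) { fast = fast->link; slow = slow->link; }
        }
        struct node *second = slow->link;
        slow->link = NULL;
        return second;
    }

    /* Top-down merge sort by ascending start address. */
    static void sort_list(struct node **head) {
        if (*head == NULL || (*head)->link == NULL) return;  /* 0 or 1 elements */
        struct node *a = *head;
        struct node *b = split_half(*head);
        sort_list(&a);
        sort_list(&b);

        struct node dummy = { 0, NULL };
        struct node *cur = &dummy;
        while (a && b) {
            if (a->start < b->start) { cur->link = a; a = a->link; }
            else                     { cur->link = b; b = b->link; }
            cur = cur->link;
        }
        cur->link = a ? a : b;          /* append whichever half is left */
        *head = dummy.link;
    }

    int main(void) {
        struct node n[5] = { {40, 0}, {10, 0}, {50, 0}, {20, 0}, {30, 0} };
        for (int i = 0; i < 4; i++) n[i].link = &n[i + 1];
        struct node *head = &n[0];
        sort_list(&head);
        for (struct node *p = head; p; p = p->link)
            printf("%lu ", (unsigned long)p->start);
        printf("\n");                   /* expected output: 10 20 30 40 50 */
        return 0;
    }

The recursion depth stays at O(log n), which keeps stack usage modest even when the deferred list is long after a large collection.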
diff --git a/rts/sm/BlockAlloc.h b/rts/sm/BlockAlloc.h
index f28e35fc87..addee6cb83 100644
--- a/rts/sm/BlockAlloc.h
+++ b/rts/sm/BlockAlloc.h
@@ -13,6 +13,9 @@
bdescr *allocLargeChunk (W_ min, W_ max);
bdescr *allocLargeChunkOnNode (uint32_t node, W_ min, W_ max);
+void deferMBlockFreeing(void);
+void commitMBlockFreeing(void);
+
/* Debugging -------------------------------------------------------------- */
extern W_ countBlocks (bdescr *bd);
diff --git a/rts/sm/GC.c b/rts/sm/GC.c
index 15aef3a9fc..71c1ecbfeb 100644
--- a/rts/sm/GC.c
+++ b/rts/sm/GC.c
@@ -442,6 +442,11 @@ GarbageCollect (uint32_t collect_gen,
memInventory(DEBUG_gc);
#endif
+ // Defer all free calls for the megablock allocator to avoid quadratic runtime
+ // explosion when freeing a lot of memory in a single GC
+ // (https://gitlab.haskell.org/ghc/ghc/-/issues/19897).
+ deferMBlockFreeing();
+
// do this *before* we start scavenging
collectFreshWeakPtrs();
@@ -977,6 +982,11 @@ GarbageCollect (uint32_t collect_gen,
resurrectThreads(resurrected_threads);
ACQUIRE_SM_LOCK;
+ // Finally free the deferred mblocks by sorting the deferred free list and
+ // merging it into the actual sorted free list. This needs to happen here so
+ // that the `returnMemoryToOS` call below can successfully free memory.
+ commitMBlockFreeing();
+
if (major_gc) {
W_ need_prealloc, need_live, need, got;
uint32_t i;
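Taken together, the intended usage is a strict defer/commit bracket around the collection. The fragment below is an illustrative sketch of that bracketing only; the real call sites are the GarbageCollect hunks above, with the intervening GC work elided:

    /* Illustrative bracketing only, not code from the patch. */
    deferMBlockFreeing();     /* free_mega_group() now pushes onto the deferred list */

    /* ... evacuate, scavenge, and free old-generation memory ... */

    commitMBlockFreeing();    /* sort the deferred list, merge, and coalesce */

    /* Only now is the free list coalesced, so returnMemoryToOS() further down
     * can hand whole megablock groups back to the OS. */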