From 9989cc67549b003db63c90f11b29d89f896323cb Mon Sep 17 00:00:00 2001
From: Ben Gamari
Date: Thu, 23 Apr 2020 12:35:40 -0400
Subject: rts: Eliminate GC spinlock

---
 includes/rts/storage/MBlock.h |  5 -----
 rts/Stats.c                   |  9 ---------
 rts/sm/GCUtils.c              | 28 ++++++----------------------
 rts/sm/HeapAlloc.h            |  4 ++--
 rts/sm/NonMoving.c            |  5 +----
 rts/sm/NonMovingMark.c        |  5 +++--
 rts/sm/Storage.c              |  6 ------
 7 files changed, 12 insertions(+), 50 deletions(-)

diff --git a/includes/rts/storage/MBlock.h b/includes/rts/storage/MBlock.h
index 3acefda9a0..38789e7863 100644
--- a/includes/rts/storage/MBlock.h
+++ b/includes/rts/storage/MBlock.h
@@ -25,8 +25,3 @@ extern void freeAllMBlocks(void);
 
 extern void *getFirstMBlock(void **state);
 extern void *getNextMBlock(void **state, void *mblock);
-
-#if defined(THREADED_RTS)
-// needed for HEAP_ALLOCED below
-extern SpinLock gc_alloc_block_sync;
-#endif
diff --git a/rts/Stats.c b/rts/Stats.c
index 2fbd8b2655..8d4bd41b31 100644
--- a/rts/Stats.c
+++ b/rts/Stats.c
@@ -937,11 +937,6 @@ static void report_summary(const RTSSummaryStats* sum)
                 , col_width[1], "SpinLock"
                 , col_width[2], "Spins"
                 , col_width[3], "Yields");
-    statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
-                , col_width[0], ""
-                , col_width[1], "gc_alloc_block_sync"
-                , col_width[2], gc_alloc_block_sync.spin
-                , col_width[3], gc_alloc_block_sync.yield);
     statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
                 , col_width[0], ""
                 , col_width[1], "gc_spin"
@@ -1124,10 +1119,6 @@ static void report_machine_readable (const RTSSummaryStats * sum)
 
     // next, internal counters
 #if defined(PROF_SPIN)
-    MR_STAT("gc_alloc_block_sync_spin", FMT_Word64, gc_alloc_block_sync.spin);
-    MR_STAT("gc_alloc_block_sync_yield", FMT_Word64,
-            gc_alloc_block_sync.yield);
-    MR_STAT("gc_alloc_block_sync_spin", FMT_Word64, gc_alloc_block_sync.spin);
     MR_STAT("gc_spin_spin", FMT_Word64, stats.gc_spin_spin);
     MR_STAT("gc_spin_yield", FMT_Word64, stats.gc_spin_yield);
     MR_STAT("mut_spin_spin", FMT_Word64, stats.mut_spin_spin);
diff --git a/rts/sm/GCUtils.c b/rts/sm/GCUtils.c
index 02c26ddf5e..cc64ede466 100644
--- a/rts/sm/GCUtils.c
+++ b/rts/sm/GCUtils.c
@@ -26,27 +26,15 @@
 #include "WSDeque.h"
 #endif
 
-#if defined(THREADED_RTS)
-SpinLock gc_alloc_block_sync;
-#endif
-
 bdescr* allocGroup_sync(uint32_t n)
 {
-    bdescr *bd;
     uint32_t node = capNoToNumaNode(gct->thread_index);
-    ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
-    bd = allocGroupOnNode(node,n);
-    RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
-    return bd;
+    return allocGroupOnNode_lock(node,n);
 }
 
 bdescr* allocGroupOnNode_sync(uint32_t node, uint32_t n)
 {
-    bdescr *bd;
-    ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
-    bd = allocGroupOnNode(node,n);
-    RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
-    return bd;
+    return allocGroupOnNode_lock(node,n);
 }
 
 static uint32_t
@@ -55,7 +43,7 @@ allocBlocks_sync(uint32_t n, bdescr **hd)
     bdescr *bd;
     uint32_t i;
     uint32_t node = capNoToNumaNode(gct->thread_index);
-    ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
+    ACQUIRE_SM_LOCK;
     bd = allocLargeChunkOnNode(node,1,n);
     // NB. allocLargeChunk, rather than allocGroup(n), to allocate in a
     // fragmentation-friendly way.
@@ -68,7 +56,7 @@ allocBlocks_sync(uint32_t n, bdescr **hd)
     bd[n-1].link = NULL;
     // We have to hold the lock until we've finished fiddling with the metadata,
     // otherwise the block allocator can get confused.
-    RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
+    RELEASE_SM_LOCK;
     *hd = bd;
     return n;
 }
@@ -76,17 +64,13 @@ allocBlocks_sync(uint32_t n, bdescr **hd)
 void
 freeChain_sync(bdescr *bd)
 {
-    ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
-    freeChain(bd);
-    RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
+    freeChain_lock(bd);
 }
 
 void
 freeGroup_sync(bdescr *bd)
 {
-    ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
-    freeGroup(bd);
-    RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
+    freeGroup_lock(bd);
 }
 
 /* -----------------------------------------------------------------------------
diff --git a/rts/sm/HeapAlloc.h b/rts/sm/HeapAlloc.h
index 58aae1119d..e05bd41f87 100644
--- a/rts/sm/HeapAlloc.h
+++ b/rts/sm/HeapAlloc.h
@@ -210,9 +210,9 @@ StgBool HEAP_ALLOCED_GC(const void *p)
     } else {
         // putting the rest out of line turned out to be a slight
         // performance improvement:
-        ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
+        ACQUIRE_SM_LOCK;
         b = HEAP_ALLOCED_miss(mblock,p);
-        RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
+        RELEASE_SM_LOCK;
         return b;
     }
 }
diff --git a/rts/sm/NonMoving.c b/rts/sm/NonMoving.c
index bdf22dc37b..6e9d4e5375 100644
--- a/rts/sm/NonMoving.c
+++ b/rts/sm/NonMoving.c
@@ -467,8 +467,7 @@ unsigned int nonmovingBlockCountFromSize(uint8_t log_block_size)
  * Request a fresh segment from the free segment list or allocate one of the
  * given node.
  *
- * Caller must hold SM_MUTEX (although we take the gc_alloc_block_sync spinlock
- * under the assumption that we are in a GC context).
+ * Caller must hold SM_MUTEX.
  */
 static struct NonmovingSegment *nonmovingAllocSegment(uint32_t node)
 {
@@ -480,12 +479,10 @@ static struct NonmovingSegment *nonmovingAllocSegment(uint32_t node)
     if (ret == NULL) {
         // Take gc spinlock: another thread may be scavenging a moving
         // generation and call `todo_block_full`
-        ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
         bdescr *bd = allocAlignedGroupOnNode(node, NONMOVING_SEGMENT_BLOCKS);
         // See Note [Live data accounting in nonmoving collector].
         oldest_gen->n_blocks += bd->blocks;
         oldest_gen->n_words += BLOCK_SIZE_W * bd->blocks;
-        RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
 
         for (StgWord32 i = 0; i < bd->blocks; ++i) {
             initBdescr(&bd[i], oldest_gen, oldest_gen);
diff --git a/rts/sm/NonMovingMark.c b/rts/sm/NonMovingMark.c
index 3113e7f013..475762beeb 100644
--- a/rts/sm/NonMovingMark.c
+++ b/rts/sm/NonMovingMark.c
@@ -442,6 +442,7 @@ push (MarkQueue *q, const MarkQueueEnt *ent)
     q->top->head++;
 }
 
+// TODO: Eliminate this
 /* A variant of push to be used by the minor GC when it encounters a reference
  * to an object in the non-moving heap. In contrast to the other push
  * operations this uses the gc_alloc_block_sync spinlock instead of the
@@ -459,13 +460,13 @@ markQueuePushClosureGC (MarkQueue *q, StgClosure *p)
     if (q->top->head == MARK_QUEUE_BLOCK_ENTRIES) {
         // Yes, this block is full.
         // allocate a fresh block.
-        ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
+        ACQUIRE_SM_LOCK;
         bdescr *bd = allocGroup(MARK_QUEUE_BLOCKS);
         bd->link = q->blocks;
         q->blocks = bd;
         q->top = (MarkQueueBlock *) bd->start;
         q->top->head = 0;
-        RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
+        RELEASE_SM_LOCK;
     }
 
     MarkQueueEnt ent = {
diff --git a/rts/sm/Storage.c b/rts/sm/Storage.c
index 49c367fa8d..1e8cfadbe0 100644
--- a/rts/sm/Storage.c
+++ b/rts/sm/Storage.c
@@ -174,12 +174,6 @@ initStorage (void)
     // Nonmoving heap uses oldest_gen so initialize it after initializing oldest_gen
     nonmovingInit();
 
-#if defined(THREADED_RTS)
-    // nonmovingAddCapabilities allocates segments, which requires taking the gc
-    // sync lock, so initialize it before nonmovingAddCapabilities
-    initSpinLock(&gc_alloc_block_sync);
-#endif
-
     if (RtsFlags.GcFlags.useNonmoving)
         nonmovingAddCapabilities(n_capabilities);
 
--
cgit v1.2.1
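
Note on the locking change (commentary, not part of the patch): the patch retires the
dedicated gc_alloc_block_sync spinlock and routes GC-time block allocation and freeing
through the storage manager's mutex instead, via the pre-existing *_lock wrappers
(allocGroupOnNode_lock, freeChain_lock, freeGroup_lock) and explicit
ACQUIRE_SM_LOCK/RELEASE_SM_LOCK pairs. Below is a minimal standalone sketch of the
before/after pattern, modelled with pthreads; every name in it (alloc_spin, sm_mutex,
alloc_block_spin, alloc_block_lock) is hypothetical, with malloc standing in for the
RTS block allocator.

    /* Standalone sketch, not RTS code: the before/after locking pattern. */
    #include <pthread.h>
    #include <stdlib.h>

    static pthread_spinlock_t alloc_spin;    /* old scheme: dedicated GC spinlock */
    static pthread_mutex_t sm_mutex = PTHREAD_MUTEX_INITIALIZER; /* new scheme */

    /* Before: GC threads spin on a lock private to the block allocator. */
    static void *alloc_block_spin(size_t n)
    {
        pthread_spin_lock(&alloc_spin);
        void *p = malloc(n);              /* stands in for allocGroupOnNode() */
        pthread_spin_unlock(&alloc_spin);
        return p;
    }

    /* After: allocation funnels through the single storage-manager mutex,
     * the analogue of allocGroupOnNode_lock() in the patch. */
    static void *alloc_block_lock(size_t n)
    {
        pthread_mutex_lock(&sm_mutex);
        void *p = malloc(n);
        pthread_mutex_unlock(&sm_mutex);
        return p;
    }

    int main(void)                        /* exercise both paths once */
    {
        pthread_spin_init(&alloc_spin, PTHREAD_PROCESS_PRIVATE);
        free(alloc_block_spin(64));
        free(alloc_block_lock(64));
        pthread_spin_destroy(&alloc_spin);
        return 0;
    }

One consequence visible in the diff itself: with the separate lock gone there is one
fewer lock to initialize in initStorage and one fewer PROF_SPIN counter to report in
Stats.c, and contended callers now block on the SM mutex rather than spin.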