-rw-r--r-- | includes/rts/storage/Block.h          |   7
-rw-r--r-- | rts/Schedule.c                        |  90
-rw-r--r-- | rts/Schedule.h                        |   6
-rw-r--r-- | rts/sm/BlockAlloc.c                   | 144
-rw-r--r-- | testsuite/tests/rts/testblockalloc.c  | 121
5 files changed, 307 insertions, 61 deletions
diff --git a/includes/rts/storage/Block.h b/includes/rts/storage/Block.h
index ecd6bf5dd8..792a72d717 100644
--- a/includes/rts/storage/Block.h
+++ b/includes/rts/storage/Block.h
@@ -290,6 +290,13 @@ EXTERN_INLINE bdescr* allocBlock(void)
 
 bdescr *allocGroupOnNode(uint32_t node, W_ n);
 
+// Allocate n blocks, aligned at n-block boundary. The returned bdescr will
+// have this invariant
+//
+//     bdescr->start % BLOCK_SIZE*n == 0
+//
+bdescr *allocAlignedGroupOnNode(uint32_t node, W_ n);
+
 EXTERN_INLINE bdescr* allocBlockOnNode(uint32_t node);
 EXTERN_INLINE bdescr* allocBlockOnNode(uint32_t node)
 {
diff --git a/rts/Schedule.c b/rts/Schedule.c
index 7ffd44d22f..8d7acc963e 100644
--- a/rts/Schedule.c
+++ b/rts/Schedule.c
@@ -110,6 +110,19 @@ Mutex sched_mutex;
 #define FORKPROCESS_PRIMOP_SUPPORTED
 #endif
 
+/*
+ * sync_finished_cond allows threads which do not own any capability (e.g. the
+ * concurrent mark thread) to participate in the sync protocol. In particular,
+ * if such a thread requests a sync while sync is already in progress it will
+ * block on sync_finished_cond, which will be signalled when the sync is
+ * finished (by releaseAllCapabilities).
+ */
+#if defined(THREADED_RTS)
+static Condition sync_finished_cond;
+static Mutex sync_finished_mutex;
+#endif
+
+
 /* -----------------------------------------------------------------------------
  * static function prototypes
  * -------------------------------------------------------------------------- */
@@ -130,7 +143,6 @@ static void scheduleYield (Capability **pcap, Task *task);
 static bool requestSync (Capability **pcap, Task *task,
                          PendingSync *sync_type, SyncType *prev_sync_type);
 static void acquireAllCapabilities(Capability *cap, Task *task);
-static void releaseAllCapabilities(uint32_t n, Capability *cap, Task *task);
 static void startWorkerTasks (uint32_t from USED_IF_THREADS,
                               uint32_t to USED_IF_THREADS);
 #endif
@@ -1368,17 +1380,24 @@ scheduleNeedHeapProfile( bool ready_to_gc )
  * change to the system, such as altering the number of capabilities, or
  * forking.
  *
+ * pCap may be NULL in the event that the caller doesn't yet own a capability.
+ *
  * To resume after stopAllCapabilities(), use releaseAllCapabilities().
  * -------------------------------------------------------------------------- */
 
 #if defined(THREADED_RTS)
-static void stopAllCapabilities (Capability **pCap, Task *task)
+void stopAllCapabilities (Capability **pCap, Task *task)
+{
+    stopAllCapabilitiesWith(pCap, task, SYNC_OTHER);
+}
+
+void stopAllCapabilitiesWith (Capability **pCap, Task *task, SyncType sync_type)
 {
     bool was_syncing;
     SyncType prev_sync_type;
 
     PendingSync sync = {
-        .type = SYNC_OTHER,
+        .type = sync_type,
         .idle = NULL,
         .task = task
     };
@@ -1387,9 +1406,10 @@ static void stopAllCapabilities (Capability **pCap, Task *task)
         was_syncing = requestSync(pCap, task, &sync, &prev_sync_type);
     } while (was_syncing);
 
-    acquireAllCapabilities(*pCap,task);
+    acquireAllCapabilities(pCap ? *pCap : NULL, task);
 
     pending_sync = 0;
+    signalCondition(&sync_finished_cond);
 }
 #endif
 
@@ -1400,6 +1420,16 @@ static void stopAllCapabilities (Capability **pCap, Task *task)
 * directly, instead use stopAllCapabilities(). This is used by the GC, which
 * has some special synchronisation requirements.
 *
+ * Note that this can be called in two ways:
+ *
+ * - where *pcap points to a capability owned by the caller: in this case
+ *   *prev_sync_type will reflect the in-progress sync type on return, if one
+ *   was found
+ *
+ * - where pcap == NULL: in this case the caller doesn't hold a capability.
+ *   we only return whether or not a pending sync was found and prev_sync_type
+ *   is unchanged.
+ *
 * Returns:
 *    false if we successfully got a sync
 *    true  if there was another sync request in progress,
@@ -1424,13 +1454,25 @@ static bool requestSync (
         // After the sync is completed, we cannot read that struct any
         // more because it has been freed.
         *prev_sync_type = sync->type;
-        do {
-            debugTrace(DEBUG_sched, "someone else is trying to sync (%d)...",
-                       sync->type);
-            ASSERT(*pcap);
-            yieldCapability(pcap,task,true);
-            sync = pending_sync;
-        } while (sync != NULL);
+        if (pcap == NULL) {
+            // The caller does not hold a capability (e.g. may be a concurrent
+            // mark thread). Consequently we must wait until the pending sync is
+            // finished before proceeding to ensure we don't loop.
+            // TODO: Don't busy-wait
+            ACQUIRE_LOCK(&sync_finished_mutex);
+            while (pending_sync) {
+                waitCondition(&sync_finished_cond, &sync_finished_mutex);
+            }
+            RELEASE_LOCK(&sync_finished_mutex);
+        } else {
+            do {
+                debugTrace(DEBUG_sched, "someone else is trying to sync (%d)...",
+                           sync->type);
+                ASSERT(*pcap);
+                yieldCapability(pcap,task,true);
+                sync = pending_sync;
+            } while (sync != NULL);
+        }
 
         // NOTE: task->cap might have changed now
         return true;
@@ -1445,9 +1487,9 @@ static bool requestSync (
 /* -----------------------------------------------------------------------------
  * acquireAllCapabilities()
  *
- * Grab all the capabilities except the one we already hold. Used
- * when synchronising before a single-threaded GC (SYNC_SEQ_GC), and
- * before a fork (SYNC_OTHER).
+ * Grab all the capabilities except the one we already hold (cap may be NULL if
+ * the caller does not currently hold a capability). Used when synchronising
+ * before a single-threaded GC (SYNC_SEQ_GC), and before a fork (SYNC_OTHER).
 *
 * Only call this after requestSync(), otherwise a deadlock might
 * ensue if another thread is trying to synchronise.
@@ -1477,29 +1519,30 @@ static void acquireAllCapabilities(Capability *cap, Task *task)
             }
         }
     }
-    task->cap = cap;
+    task->cap = cap == NULL ? tmpcap : cap;
 }
 #endif
 
 /* -----------------------------------------------------------------------------
- * releaseAllcapabilities()
+ * releaseAllCapabilities()
 *
- * Assuming this thread holds all the capabilities, release them all except for
- * the one passed in as cap.
+ * Assuming this thread holds all the capabilities, release them all (except for
+ * the one passed in as keep_cap, if non-NULL).
 * -------------------------------------------------------------------------- */
 
 #if defined(THREADED_RTS)
-static void releaseAllCapabilities(uint32_t n, Capability *cap, Task *task)
+void releaseAllCapabilities(uint32_t n, Capability *keep_cap, Task *task)
 {
     uint32_t i;
 
     for (i = 0; i < n; i++) {
-        if (cap->no != i) {
-            task->cap = capabilities[i];
-            releaseCapability(capabilities[i]);
+        Capability *tmpcap = capabilities[i];
+        if (keep_cap != tmpcap) {
+            task->cap = tmpcap;
+            releaseCapability(tmpcap);
        }
     }
-    task->cap = cap;
+    task->cap = keep_cap;
 }
 #endif
@@ -1801,6 +1844,7 @@ delete_threads_and_gc:
     // reset pending_sync *before* GC, so that when the GC threads
     // emerge they don't immediately re-enter the GC.
     pending_sync = 0;
+    signalCondition(&sync_finished_cond);
     GarbageCollect(collect_gen, heap_census, gc_type, cap, idle_cap);
 #else
     GarbageCollect(collect_gen, heap_census, 0, cap, NULL);
diff --git a/rts/Schedule.h b/rts/Schedule.h
index 66cf8391f3..a4772295d9 100644
--- a/rts/Schedule.h
+++ b/rts/Schedule.h
@@ -49,6 +49,12 @@ StgWord findRetryFrameHelper (Capability *cap, StgTSO *tso);
 /* Entry point for a new worker */
 void scheduleWorker (Capability *cap, Task *task);
 
+#if defined(THREADED_RTS)
+void stopAllCapabilitiesWith (Capability **pCap, Task *task, SyncType sync_type);
+void stopAllCapabilities (Capability **pCap, Task *task);
+void releaseAllCapabilities(uint32_t n, Capability *keep_cap, Task *task);
+#endif
+
 /* The state of the scheduler. This is used to control the sequence
  * of events during shutdown. See Note [shutdown] in Schedule.c.
  */
diff --git a/rts/sm/BlockAlloc.c b/rts/sm/BlockAlloc.c
index f9e3d11407..b3e1e2ce75 100644
--- a/rts/sm/BlockAlloc.c
+++ b/rts/sm/BlockAlloc.c
@@ -310,7 +310,7 @@ setup_tail (bdescr *bd)
 
 // Take a free block group bd, and split off a group of size n from
 // it. Adjust the free list as necessary, and return the new group.
 static bdescr *
-split_free_block (bdescr *bd, uint32_t node, W_ n, uint32_t ln)
+split_free_block (bdescr *bd, uint32_t node, W_ n, uint32_t ln /* log_2_ceil(n) */)
 {
     bdescr *fg; // free group
@@ -325,6 +325,46 @@ split_free_block (bdescr *bd, uint32_t node, W_ n, uint32_t ln)
 
     return fg;
 }
+
+// Take N blocks off the end, free the rest.
+static bdescr *
+split_block_high (bdescr *bd, W_ n)
+{
+    ASSERT(bd->blocks > n);
+
+    bdescr* ret = bd + bd->blocks - n; // take n blocks off the end
+    ret->blocks = n;
+    ret->start = ret->free = bd->start + (bd->blocks - n)*BLOCK_SIZE_W;
+    ret->link = NULL;
+
+    bd->blocks -= n;
+
+    setup_tail(ret);
+    setup_tail(bd);
+    freeGroup(bd);
+
+    return ret;
+}
+
+// Like `split_block_high`, but takes n blocks off the beginning rather
+// than the end.
+static bdescr *
+split_block_low (bdescr *bd, W_ n)
+{
+    ASSERT(bd->blocks > n);
+
+    bdescr* bd_ = bd + n;
+    bd_->blocks = bd->blocks - n;
+    bd_->start = bd_->free = bd->start + n*BLOCK_SIZE_W;
+
+    bd->blocks = n;
+
+    setup_tail(bd_);
+    setup_tail(bd);
+    freeGroup(bd_);
+
+    return bd;
+}
+
 /* Only initializes the start pointers on the first megablock and the
  * blocks field of the first bdescr; callers are responsible for calling
  * initGroup afterwards.
@@ -461,6 +501,108 @@ finish:
     return bd;
 }
 
+// Allocate `n` blocks aligned to `n` blocks, e.g. when n = 8, the blocks will
+// be aligned at `8 * BLOCK_SIZE`. For a group with `n` blocks this can be used
+// for easily accessing the beginning of the group from a location p in the
+// group with
+//
+//     p % (BLOCK_SIZE*n)
+//
+// Used by the non-moving collector for allocating segments.
+//
+// Because the storage manager does not support aligned allocations, we have to
+// allocate `2*n - 1` blocks here to make sure we'll be able to find an aligned
+// region in the allocated blocks. After finding the aligned area we want to
+// free slop on the low and high sides, and block allocator doesn't support
+// freeing only some portion of a megablock (we can only free whole megablocks).
+// So we disallow allocating megablocks here, and allow allocating at most
+// `BLOCKS_PER_MBLOCK / 2` blocks.
+bdescr *
+allocAlignedGroupOnNode (uint32_t node, W_ n)
+{
+    // allocate enough blocks to have enough space aligned at n-block boundary
+    // free any slops on the low and high side of this space
+
+    // number of blocks to allocate to make sure we have enough aligned space
+    W_ num_blocks = 2*n - 1;
+
+    if (num_blocks >= BLOCKS_PER_MBLOCK) {
+        barf("allocAlignedGroupOnNode: allocating megablocks is not supported\n"
+             "    requested blocks: %" FMT_Word "\n"
+             "    required for alignment: %" FMT_Word "\n"
+             "    megablock size (in blocks): %" FMT_Word,
+             n, num_blocks, (W_) BLOCKS_PER_MBLOCK);
+    }
+
+    W_ group_size = n * BLOCK_SIZE;
+
+    // To reduce splitting and fragmentation we use allocLargeChunkOnNode here.
+    // Tweak the max allocation to avoid allocating megablocks. Splitting slop
+    // below doesn't work with megablocks (freeGroup can't free only a portion
+    // of a megablock so we can't allocate megablocks and free some parts of
+    // them).
+    W_ max_blocks = stg_min(num_blocks * 3, BLOCKS_PER_MBLOCK - 1);
+    bdescr *bd = allocLargeChunkOnNode(node, num_blocks, max_blocks);
+    // We may allocate more than num_blocks, so update it
+    num_blocks = bd->blocks;
+
+    // slop on the low side
+    W_ slop_low = 0;
+    if ((uintptr_t)bd->start % group_size != 0) {
+        slop_low = group_size - ((uintptr_t)bd->start % group_size);
+    }
+
+    W_ slop_high = (num_blocks * BLOCK_SIZE) - group_size - slop_low;
+
+    ASSERT((slop_low % BLOCK_SIZE) == 0);
+    ASSERT((slop_high % BLOCK_SIZE) == 0);
+
+    W_ slop_low_blocks = slop_low / BLOCK_SIZE;
+    W_ slop_high_blocks = slop_high / BLOCK_SIZE;
+
+    ASSERT(slop_low_blocks + slop_high_blocks + n == num_blocks);
+
+#if defined(DEBUG)
+    checkFreeListSanity();
+    W_ free_before = countFreeList();
+#endif
+
+    if (slop_low_blocks != 0) {
+        bd = split_block_high(bd, num_blocks - slop_low_blocks);
+        ASSERT(countBlocks(bd) == num_blocks - slop_low_blocks);
+    }
+
+#if defined(DEBUG)
+    ASSERT(countFreeList() == free_before + slop_low_blocks);
+    checkFreeListSanity();
+#endif
+
+    // At this point the bd should be aligned, but we may have slop on the high side
+    ASSERT((uintptr_t)bd->start % group_size == 0);
+
+#if defined(DEBUG)
+    free_before = countFreeList();
+#endif
+
+    if (slop_high_blocks != 0) {
+        bd = split_block_low(bd, n);
+        ASSERT(bd->blocks == n);
+    }
+
+#if defined(DEBUG)
+    ASSERT(countFreeList() == free_before + slop_high_blocks);
+    checkFreeListSanity();
+#endif
+
+    // Should still be aligned
+    ASSERT((uintptr_t)bd->start % group_size == 0);
+
+    // Just to make sure I get this right
+    ASSERT(Bdescr(bd->start) == bd);
+
+    return bd;
+}
+
 STATIC_INLINE
 uint32_t nodeWithLeastBlocks (void)
 {
diff --git a/testsuite/tests/rts/testblockalloc.c b/testsuite/tests/rts/testblockalloc.c
index 577245f45e..53eed24015 100644
--- a/testsuite/tests/rts/testblockalloc.c
+++ b/testsuite/tests/rts/testblockalloc.c
@@ -3,6 +3,7 @@
 #include <stdio.h>
 
 extern bdescr *allocGroup_lock(uint32_t n);
+extern bdescr *allocAlignedGroupOnNode (uint32_t node, W_ n);
 extern void freeGroup_lock(bdescr *p);
 
 const int ARRSIZE = 256;
@@ -13,64 +14,110 @@ const int SEED = 0xf00f00;
 
 extern StgWord mblocks_allocated;
 
-int main (int argc, char *argv[])
+static void test_random_alloc(void)
 {
-    int i, j, b;
 
     bdescr *a[ARRSIZE];
 
     srand(SEED);
+    // repeatedly sweep though the array, allocating new random-sized
+    // objects and deallocating the old ones.
+    for (int i=0; i < LOOPS; i++)
+    {
+        for (int j=0; j < ARRSIZE; j++)
+        {
+            if (i > 0)
+            {
+                IF_DEBUG(block_alloc, debugBelch("A%d: freeing %p, %d blocks @ %p\n", j, a[j], a[j]->blocks, a[j]->start));
+                freeGroup_lock(a[j]);
+                DEBUG_ONLY(checkFreeListSanity());
+            }
+
+            int b = (rand() % MAXALLOC) + 1;
+            a[j] = allocGroup_lock(b);
+            IF_DEBUG(block_alloc, debugBelch("A%d: allocated %p, %d blocks @ %p\n", j, a[j], b, a[j]->start));
+            // allocating zero blocks isn't allowed
+            DEBUG_ONLY(checkFreeListSanity());
+        }
+    }
 
+    for (int j=0; j < ARRSIZE; j++)
     {
-        RtsConfig conf = defaultRtsConfig;
-        conf.rts_opts_enabled = RtsOptsAll;
-        hs_init_ghc(&argc, &argv, conf);
+        freeGroup_lock(a[j]);
     }
+}
+
+static void test_sequential_alloc(void)
+{
+    bdescr *a[ARRSIZE];
 
-    // repeatedly sweep though the array, allocating new random-sized
-    // objects and deallocating the old ones.
-    for (i=0; i < LOOPS; i++)
-    {
-        for (j=0; j < ARRSIZE; j++)
-        {
-            if (i > 0)
-            {
-                IF_DEBUG(block_alloc, debugBelch("A%d: freeing %p, %d blocks @ %p\n", j, a[j], a[j]->blocks, a[j]->start));
-                freeGroup_lock(a[j]);
-                DEBUG_ONLY(checkFreeListSanity());
-            }
-            b = (rand() % MAXALLOC) + 1;
-            a[j] = allocGroup_lock(b);
-            IF_DEBUG(block_alloc, debugBelch("A%d: allocated %p, %d blocks @ %p\n", j, a[j], b, a[j]->start));
-            // allocating zero blocks isn't allowed
-            DEBUG_ONLY(checkFreeListSanity());
-        }
-    }
-
-    for (j=0; j < ARRSIZE; j++)
-    {
-        freeGroup_lock(a[j]);
-    }
-
     // this time, sweep forwards allocating new blocks, and then
     // backwards deallocating them.
-    for (i=0; i < LOOPS; i++)
+    for (int i=0; i < LOOPS; i++)
     {
-        for (j=0; j < ARRSIZE; j++)
+        for (int j=0; j < ARRSIZE; j++)
         {
-            b = (rand() % MAXALLOC) + 1;
+            int b = (rand() % MAXALLOC) + 1;
             a[j] = allocGroup_lock(b);
             IF_DEBUG(block_alloc, debugBelch("B%d,%d: allocated %p, %d blocks @ %p\n", i, j, a[j], b, a[j]->start));
             DEBUG_ONLY(checkFreeListSanity());
         }
-        for (j=ARRSIZE-1; j >= 0; j--)
+        for (int j=ARRSIZE-1; j >= 0; j--)
        {
             IF_DEBUG(block_alloc, debugBelch("B%d,%d: freeing %p, %d blocks @ %p\n", i, j, a[j], a[j]->blocks, a[j]->start));
             freeGroup_lock(a[j]);
             DEBUG_ONLY(checkFreeListSanity());
         }
     }
-
+}
+
+static void test_aligned_alloc(void)
+{
+    bdescr *a[ARRSIZE];
+
+    // this time, sweep forwards allocating new blocks, and then
+    // backwards deallocating them.
+    for (int i=0; i < LOOPS; i++)
+    {
+        for (int j=0; j < ARRSIZE; j++)
+        {
+            // allocAlignedGroupOnNode does not support allocating more than
+            // BLOCKS_PER_MBLOCK/2 blocks.
+            int b = rand() % (BLOCKS_PER_MBLOCK / 2);
+            if (b == 0) { b = 1; }
+            a[j] = allocAlignedGroupOnNode(0, b);
+            if ((((W_)(a[j]->start)) % (b*BLOCK_SIZE)) != 0)
+            {
+                barf("%p is not aligned to allocation size %d", a[j], b);
+            }
+            IF_DEBUG(block_alloc, debugBelch("B%d,%d: allocated %p, %d blocks @ %p\n", i, j, a[j], b, a[j]->start));
+            DEBUG_ONLY(checkFreeListSanity());
+        }
+        for (int j=ARRSIZE-1; j >= 0; j--)
+        {
+            IF_DEBUG(block_alloc, debugBelch("B%d,%d: freeing %p, %d blocks @ %p\n", i, j, a[j], a[j]->blocks, a[j]->start));
+            freeGroup_lock(a[j]);
+            DEBUG_ONLY(checkFreeListSanity());
+        }
+    }
+}
+
+int main (int argc, char *argv[])
+{
+    int i, j, b;
+
+    bdescr *a[ARRSIZE];
+
+    srand(SEED);
+
+    {
+        RtsConfig conf = defaultRtsConfig;
+        conf.rts_opts_enabled = RtsOptsAll;
+        hs_init_ghc(&argc, &argv, conf);
+    }
+
+    test_random_alloc();
+    test_sequential_alloc();
+    test_aligned_alloc();
+
     DEBUG_ONLY(checkFreeListSanity());
 
     hs_exit(); // will do a memory leak test
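
The Block.h comment above documents the invariant that groups returned by allocAlignedGroupOnNode satisfy bdescr->start % (BLOCK_SIZE*n) == 0. A minimal standalone sketch (not part of the patch) of what that invariant buys a client such as the non-moving collector: any pointer into the group can be rounded down to the group's start with plain modular arithmetic. The helper name and the BLOCK_SIZE value below are assumptions for illustration only.

#include <stdint.h>
#include <stddef.h>

/* Assumption for this sketch: 4 KiB blocks; the real value comes from the RTS headers. */
#define BLOCK_SIZE 4096

/* Hypothetical helper (not in the patch): recover the start of an n-block group
 * returned by allocAlignedGroupOnNode() from any pointer p inside it.
 * Relies on the guarantee bdescr->start % (BLOCK_SIZE*n) == 0. */
static inline void *group_start_from_ptr(void *p, size_t n_blocks)
{
    uintptr_t group_size = (uintptr_t)n_blocks * BLOCK_SIZE;
    return (void *)((uintptr_t)p - ((uintptr_t)p % group_size));
}

When n is a power of two, as it is for non-moving segments, the modulo can equivalently be replaced by masking with ~(group_size - 1).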
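The BlockAlloc.c comment explains why allocAlignedGroupOnNode over-allocates 2*n - 1 blocks: any contiguous run of 2n-1 blocks necessarily contains an n-aligned run of n blocks, and the remainder is split off as low and high slop. A small self-contained sketch of that slop arithmetic, using an assumed BLOCK_SIZE of 4096 and a made-up unaligned starting address purely for illustration:

#include <assert.h>
#include <stdio.h>
#include <stdint.h>

#define BLOCK_SIZE 4096   /* assumption for the sketch; the RTS defines the real value */

int main(void)
{
    uintptr_t n          = 4;          /* requested group size in blocks           */
    uintptr_t num_blocks = 2*n - 1;    /* what allocAlignedGroupOnNode asks for     */
    uintptr_t group_size = n * BLOCK_SIZE;

    /* Pretend the block allocator handed back a group starting 6 blocks past
     * an n-block boundary (an arbitrary, unaligned starting point). */
    uintptr_t start = 6 * BLOCK_SIZE;

    /* Same slop computation as in the patch. */
    uintptr_t slop_low = 0;
    if (start % group_size != 0)
        slop_low = group_size - (start % group_size);
    uintptr_t slop_high = num_blocks*BLOCK_SIZE - group_size - slop_low;

    /* Low slop + the aligned n-block group + high slop covers the whole allocation. */
    assert(slop_low/BLOCK_SIZE + n + slop_high/BLOCK_SIZE == num_blocks);
    printf("slop_low = %lu blocks, slop_high = %lu blocks\n",
           (unsigned long)(slop_low/BLOCK_SIZE),
           (unsigned long)(slop_high/BLOCK_SIZE));
    return 0;
}

In the worst case slop_low is n-1 blocks and slop_high is zero, which is exactly why 2*n - 1 blocks always suffice without ever needing a megablock-sized request.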