Diffstat (limited to 'rts')
-rw-r--r--   rts/Capability.c      |   5
-rw-r--r--   rts/PrimOps.cmm       |   4
-rw-r--r--   rts/Threads.c         |   4
-rw-r--r--   rts/sm/BlockAlloc.c   | 144
-rw-r--r--   rts/sm/Evac.c         |  15
-rw-r--r--   rts/sm/GC.c           |  21
-rw-r--r--   rts/sm/Sanity.c       |   8
-rw-r--r--   rts/sm/Storage.c      |   4
8 files changed, 180 insertions, 25 deletions
diff --git a/rts/Capability.c b/rts/Capability.c
index 33a94398cd..8b552e0b09 100644
--- a/rts/Capability.c
+++ b/rts/Capability.c
@@ -748,6 +748,8 @@ static Capability * waitForReturnCapability (Task *task)
  * result of the external call back to the Haskell thread that
  * made it.
  *
+ * pCap is strictly an output.
+ *
  * ------------------------------------------------------------------------- */
 
 void waitForCapability (Capability **pCap, Task *task)
@@ -840,6 +842,9 @@ void waitForCapability (Capability **pCap, Task *task)
  * SYNC_GC_PAR), either to do a sequential GC, forkProcess, or
  * setNumCapabilities. We should give up the Capability temporarily.
  *
+ * When yieldCapability returns *pCap will have been updated to the new
+ * capability held by the caller.
+ *
  * ------------------------------------------------------------------------- */
 
 #if defined(THREADED_RTS)
diff --git a/rts/PrimOps.cmm b/rts/PrimOps.cmm
index d06cde05d9..a2ab3de586 100644
--- a/rts/PrimOps.cmm
+++ b/rts/PrimOps.cmm
@@ -1744,7 +1744,7 @@ loop:
     // indicate that the MVar operation has now completed.
     StgTSO__link(tso) = stg_END_TSO_QUEUE_closure;
 
-    if (TO_W_(StgStack_dirty(stack)) == 0) {
+    if ((TO_W_(StgStack_dirty(stack)) & STACK_DIRTY) == 0) {
         ccall dirty_STACK(MyCapability() "ptr", stack "ptr");
     }
 
@@ -1829,7 +1829,7 @@ loop:
     // indicate that the MVar operation has now completed.
     StgTSO__link(tso) = stg_END_TSO_QUEUE_closure;
 
-    if (TO_W_(StgStack_dirty(stack)) == 0) {
+    if ((TO_W_(StgStack_dirty(stack)) & STACK_DIRTY) == 0) {
         ccall dirty_STACK(MyCapability() "ptr", stack "ptr");
     }
 
diff --git a/rts/Threads.c b/rts/Threads.c
index 2bdcea1c00..3d5b463051 100644
--- a/rts/Threads.c
+++ b/rts/Threads.c
@@ -85,7 +85,7 @@ createThread(Capability *cap, W_ size)
     SET_HDR(stack, &stg_STACK_info, cap->r.rCCCS);
     stack->stack_size = stack_size - sizeofW(StgStack);
     stack->sp = stack->stack + stack->stack_size;
-    stack->dirty = 1;
+    stack->dirty = STACK_DIRTY;
 
     tso = (StgTSO *)allocate(cap, sizeofW(StgTSO));
     TICK_ALLOC_TSO();
@@ -804,7 +804,7 @@ loop:
     // indicate that the MVar operation has now completed.
     tso->_link = (StgTSO*)&stg_END_TSO_QUEUE_closure;
 
-    if (stack->dirty == 0) {
+    if ((stack->dirty & STACK_DIRTY) == 0) {
         dirty_STACK(cap, stack);
     }
 
diff --git a/rts/sm/BlockAlloc.c b/rts/sm/BlockAlloc.c
index f9e3d11407..b3e1e2ce75 100644
--- a/rts/sm/BlockAlloc.c
+++ b/rts/sm/BlockAlloc.c
@@ -310,7 +310,7 @@ setup_tail (bdescr *bd)
 // Take a free block group bd, and split off a group of size n from
 // it.  Adjust the free list as necessary, and return the new group.
 static bdescr *
-split_free_block (bdescr *bd, uint32_t node, W_ n, uint32_t ln)
+split_free_block (bdescr *bd, uint32_t node, W_ n, uint32_t ln /* log_2_ceil(n) */)
 {
     bdescr *fg; // free group
 
@@ -325,6 +325,46 @@ split_free_block (bdescr *bd, uint32_t node, W_ n, uint32_t ln)
     return fg;
 }
 
+// Take N blocks off the end, free the rest.
+static bdescr *
+split_block_high (bdescr *bd, W_ n)
+{
+    ASSERT(bd->blocks > n);
+
+    bdescr* ret = bd + bd->blocks - n; // take n blocks off the end
+    ret->blocks = n;
+    ret->start = ret->free = bd->start + (bd->blocks - n)*BLOCK_SIZE_W;
+    ret->link = NULL;
+
+    bd->blocks -= n;
+
+    setup_tail(ret);
+    setup_tail(bd);
+    freeGroup(bd);
+
+    return ret;
+}
+
+// Like `split_block_high`, but takes n blocks off the beginning rather
+// than the end.
+static bdescr *
+split_block_low (bdescr *bd, W_ n)
+{
+    ASSERT(bd->blocks > n);
+
+    bdescr* bd_ = bd + n;
+    bd_->blocks = bd->blocks - n;
+    bd_->start = bd_->free = bd->start + n*BLOCK_SIZE_W;
+
+    bd->blocks = n;
+
+    setup_tail(bd_);
+    setup_tail(bd);
+    freeGroup(bd_);
+
+    return bd;
+}
+
 /* Only initializes the start pointers on the first megablock and the
  * blocks field of the first bdescr; callers are responsible for calling
  * initGroup afterwards.
@@ -461,6 +501,108 @@ finish:
     return bd;
 }
 
+// Allocate `n` blocks aligned to `n` blocks, e.g. when n = 8, the blocks will
+// be aligned at `8 * BLOCK_SIZE`. For a group with `n` blocks this can be used
+// for easily accessing the beginning of the group from a location p in the
+// group with
+//
+//     p % (BLOCK_SIZE*n)
+//
+// Used by the non-moving collector for allocating segments.
+//
+// Because the storage manager does not support aligned allocations, we have to
+// allocate `2*n - 1` blocks here to make sure we'll be able to find an aligned
+// region in the allocated blocks. After finding the aligned area we want to
+// free slop on the low and high sides, and block allocator doesn't support
+// freeing only some portion of a megablock (we can only free whole megablocks).
+// So we disallow allocating megablocks here, and allow allocating at most
+// `BLOCKS_PER_MBLOCK / 2` blocks.
+bdescr *
+allocAlignedGroupOnNode (uint32_t node, W_ n)
+{
+    // allocate enough blocks to have enough space aligned at n-block boundary
+    // free any slops on the low and high side of this space
+
+    // number of blocks to allocate to make sure we have enough aligned space
+    W_ num_blocks = 2*n - 1;
+
+    if (num_blocks >= BLOCKS_PER_MBLOCK) {
+        barf("allocAlignedGroupOnNode: allocating megablocks is not supported\n"
+             "    requested blocks: %" FMT_Word "\n"
+             "    required for alignment: %" FMT_Word "\n"
+             "    megablock size (in blocks): %" FMT_Word,
+             n, num_blocks, (W_) BLOCKS_PER_MBLOCK);
+    }
+
+    W_ group_size = n * BLOCK_SIZE;
+
+    // To reduce splitting and fragmentation we use allocLargeChunkOnNode here.
+    // Tweak the max allocation to avoid allocating megablocks. Splitting slop
+    // below doesn't work with megablocks (freeGroup can't free only a portion
+    // of a megablock so we can't allocate megablocks and free some parts of
+    // them).
+    W_ max_blocks = stg_min(num_blocks * 3, BLOCKS_PER_MBLOCK - 1);
+    bdescr *bd = allocLargeChunkOnNode(node, num_blocks, max_blocks);
+    // We may allocate more than num_blocks, so update it
+    num_blocks = bd->blocks;
+
+    // slop on the low side
+    W_ slop_low = 0;
+    if ((uintptr_t)bd->start % group_size != 0) {
+        slop_low = group_size - ((uintptr_t)bd->start % group_size);
+    }
+
+    W_ slop_high = (num_blocks * BLOCK_SIZE) - group_size - slop_low;
+
+    ASSERT((slop_low % BLOCK_SIZE) == 0);
+    ASSERT((slop_high % BLOCK_SIZE) == 0);
+
+    W_ slop_low_blocks = slop_low / BLOCK_SIZE;
+    W_ slop_high_blocks = slop_high / BLOCK_SIZE;
+
+    ASSERT(slop_low_blocks + slop_high_blocks + n == num_blocks);
+
+#if defined(DEBUG)
+    checkFreeListSanity();
+    W_ free_before = countFreeList();
+#endif
+
+    if (slop_low_blocks != 0) {
+        bd = split_block_high(bd, num_blocks - slop_low_blocks);
+        ASSERT(countBlocks(bd) == num_blocks - slop_low_blocks);
+    }
+
+#if defined(DEBUG)
+    ASSERT(countFreeList() == free_before + slop_low_blocks);
+    checkFreeListSanity();
+#endif
+
+    // At this point the bd should be aligned, but we may have slop on the high side
+    ASSERT((uintptr_t)bd->start % group_size == 0);
+
+#if defined(DEBUG)
+    free_before = countFreeList();
+#endif
+
+    if (slop_high_blocks != 0) {
+        bd = split_block_low(bd, n);
+        ASSERT(bd->blocks == n);
+    }
+
+#if defined(DEBUG)
+    ASSERT(countFreeList() == free_before + slop_high_blocks);
+    checkFreeListSanity();
+#endif
+
+    // Should still be aligned
+    ASSERT((uintptr_t)bd->start % group_size == 0);
+
+    // Just to make sure I get this right
+    ASSERT(Bdescr(bd->start) == bd);
+
+    return bd;
+}
+
 STATIC_INLINE
 uint32_t nodeWithLeastBlocks (void)
 {
diff --git a/rts/sm/Evac.c b/rts/sm/Evac.c
index 53a473d26c..666daf0e32 100644
--- a/rts/sm/Evac.c
+++ b/rts/sm/Evac.c
@@ -39,7 +39,19 @@ copy_tag(p, info, src, size, stp, tag)
 
 #endif
 
-/* Used to avoid long recursion due to selector thunks
+/* Note [Selector optimisation depth limit]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * MAX_THUNK_SELECTOR_DEPTH is used to avoid long recursion of
+ * eval_thunk_selector due to nested selector thunks. Note that this *only*
+ * counts nested selector thunks, e.g. `fst (fst (... (fst x)))`. The collector
+ * will traverse interleaved selector-constructor pairs without limit, e.g.
+ *
+ *     a = (fst b, _)
+ *     b = (fst c, _)
+ *     c = (fst d, _)
+ *     d = (x, _)
+ *
  */
 #define MAX_THUNK_SELECTOR_DEPTH 16
 
@@ -1257,6 +1269,7 @@ selector_loop:
 
               // recursively evaluate this selector.  We don't want to
               // recurse indefinitely, so we impose a depth bound.
+              // See Note [Selector optimisation depth limit].
              if (gct->thunk_selector_depth >= MAX_THUNK_SELECTOR_DEPTH) {
                   goto bale_out;
               }
diff --git a/rts/sm/GC.c b/rts/sm/GC.c
index 76237f35c2..3f301aeb53 100644
--- a/rts/sm/GC.c
+++ b/rts/sm/GC.c
@@ -1841,21 +1841,16 @@ resize_nursery (void)
 
 #if defined(DEBUG)
 
-static void gcCAFs(void)
+void gcCAFs(void)
 {
-    StgIndStatic *p, *prev;
+    uint32_t i = 0;
+    StgIndStatic *prev = NULL;
 
-    const StgInfoTable *info;
-    uint32_t i;
-
-    i = 0;
-    p = debug_caf_list;
-    prev = NULL;
-
-    for (p = debug_caf_list; p != (StgIndStatic*)END_OF_CAF_LIST;
-         p = (StgIndStatic*)p->saved_info) {
-
-        info = get_itbl((StgClosure*)p);
+    for (StgIndStatic *p = debug_caf_list;
+         p != (StgIndStatic*) END_OF_CAF_LIST;
+         p = (StgIndStatic*) p->saved_info)
+    {
+        const StgInfoTable *info = get_itbl((StgClosure*)p);
         ASSERT(info->type == IND_STATIC);
 
         // See Note [STATIC_LINK fields] in Storage.h
diff --git a/rts/sm/Sanity.c b/rts/sm/Sanity.c
index 3585bd93b4..289ac542b8 100644
--- a/rts/sm/Sanity.c
+++ b/rts/sm/Sanity.c
@@ -632,9 +632,9 @@ checkGlobalTSOList (bool checkTSOs)
 
           stack = tso->stackobj;
           while (1) {
-              if (stack->dirty & 1) {
-                  ASSERT(Bdescr((P_)stack)->gen_no == 0 || (stack->dirty & TSO_MARKED));
-                  stack->dirty &= ~TSO_MARKED;
+              if (stack->dirty & STACK_DIRTY) {
+                  ASSERT(Bdescr((P_)stack)->gen_no == 0 || (stack->dirty & STACK_SANE));
+                  stack->dirty &= ~STACK_SANE;
               }
               frame = (StgUnderflowFrame*) (stack->stack + stack->stack_size
                                             - sizeofW(StgUnderflowFrame));
@@ -669,7 +669,7 @@ checkMutableList( bdescr *mut_bd, uint32_t gen )
                   ((StgTSO *)p)->flags |= TSO_MARKED;
                   break;
               case STACK:
-                  ((StgStack *)p)->dirty |= TSO_MARKED;
+                  ((StgStack *)p)->dirty |= STACK_SANE;
                   break;
               }
           }
diff --git a/rts/sm/Storage.c b/rts/sm/Storage.c
index 0130a08f7c..97c71478ed 100644
--- a/rts/sm/Storage.c
+++ b/rts/sm/Storage.c
@@ -1139,8 +1139,8 @@ dirty_TSO (Capability *cap, StgTSO *tso)
 void
 dirty_STACK (Capability *cap, StgStack *stack)
 {
-    if (stack->dirty == 0) {
-        stack->dirty = 1;
+    if (! (stack->dirty & STACK_DIRTY)) {
+        stack->dirty = STACK_DIRTY;
         recordClosureMutated(cap,(StgClosure*)stack);
     }
 }
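
The recurring change from `stack->dirty == 0` to `(stack->dirty & STACK_DIRTY) == 0` (in PrimOps.cmm, Threads.c, and Storage.c above) reflects that the stack's dirty field now holds a small bit set rather than a boolean: the Sanity.c hunks store their own mark, STACK_SANE, in the same field. Once a second bit shares the field, an equality test against zero misreads a stack that carries only the sanity bit. Below is a minimal sketch of the flag discipline; the flag values are assumed for illustration, the real constants are defined in the RTS headers.

    #include <assert.h>
    #include <stdint.h>

    /* Assumed values for illustration; the actual constants live in
     * the GHC RTS headers. */
    #define STACK_DIRTY (1 << 0)  /* stack mutated since the last GC       */
    #define STACK_SANE  (1 << 1)  /* mark used only by the sanity checker  */

    int main(void)
    {
        uint32_t dirty = 0;

        /* The sanity checker marks a stack without dirtying it. */
        dirty |= STACK_SANE;

        /* The old test (dirty == 0) now wrongly concludes the stack is
         * already dirty, so it would skip recording the mutation... */
        assert(dirty != 0);

        /* ...whereas testing the dirty bit itself still answers correctly. */
        assert((dirty & STACK_DIRTY) == 0);
        return 0;
    }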
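The alignment property established by allocAlignedGroupOnNode is what lets a consumer (the non-moving collector's segments) find the start of a group from any interior pointer with plain modular arithmetic, as the `p % (BLOCK_SIZE*n)` remark in its comment hints. A sketch of that computation, using an illustrative BLOCK_SIZE and an invented helper name:

    #include <stdint.h>

    #define BLOCK_SIZE 4096  /* illustrative; the RTS defines the real value */

    /* Hypothetical helper: given any address p inside an n-block group
     * returned by allocAlignedGroupOnNode, recover the group's start.
     * This works only because the group is aligned to n * BLOCK_SIZE. */
    static inline uintptr_t
    aligned_group_start (uintptr_t p, uintptr_t n)
    {
        uintptr_t group_size = n * BLOCK_SIZE;
        return p - (p % group_size);  /* strip the offset within the group */
    }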
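It may help to run the slop arithmetic from allocAlignedGroupOnNode on concrete numbers. Working in whole-block units (so BLOCK_SIZE cancels out), suppose n = 8, the allocator returned exactly num_blocks = 2*n - 1 blocks, and the chunk happens to start 3 blocks past an 8-block boundary; all figures here are invented for illustration.

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uintptr_t n          = 8;        /* requested group size, in blocks   */
        uintptr_t num_blocks = 2*n - 1;  /* = 15; always contains an aligned
                                          * n-block region somewhere inside   */
        uintptr_t start      = 19;       /* assumed chunk start, in block
                                          * units: 3 past an 8-boundary       */

        /* Blocks shaved off the low side to reach the next n-boundary;
         * freed by the split_block_high call in allocAlignedGroupOnNode. */
        uintptr_t slop_low  = (start % n == 0) ? 0 : n - (start % n);   /* 5 */

        /* Whatever remains above the aligned group; freed by split_block_low. */
        uintptr_t slop_high = num_blocks - n - slop_low;                /* 2 */

        assert(slop_low + n + slop_high == num_blocks);
        assert((start + slop_low) % n == 0);  /* the kept group is aligned */
        return 0;
    }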
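Finally, a toy model of the depth bound described in Note [Selector optimisation depth limit]: only nested selector applications count toward the limit, and past the limit the collector abandons the optimisation rather than recursing further (the `goto bale_out` above). The structure and names below are invented for illustration; the real logic lives in eval_thunk_selector in Evac.c.

    #include <stddef.h>

    #define MAX_THUNK_SELECTOR_DEPTH 16  /* same constant as in Evac.c */

    /* A fake "selector thunk": either selects from another thunk, or
     * holds a final value (standing in for a constructor). */
    struct thunk { struct thunk *selectee; int value; };

    /* Follow a chain of nested selectors, but only up to the bound; past
     * it, report failure instead of recursing indefinitely. Returns 1 on
     * success (writing the result to *out), 0 when baling out. */
    static int eval_selector (const struct thunk *t, int depth, int *out)
    {
        if (t->selectee == NULL) {                  /* reached a "constructor" */
            *out = t->value;
            return 1;
        }
        if (depth >= MAX_THUNK_SELECTOR_DEPTH) {    /* bale out */
            return 0;
        }
        return eval_selector(t->selectee, depth + 1, out);
    }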