Diffstat (limited to 'rts/sm')
-rw-r--r--   rts/sm/NonMoving.c     | 22
-rw-r--r--   rts/sm/NonMoving.h     |  6
-rw-r--r--   rts/sm/NonMovingMark.c |  5
-rw-r--r--   rts/sm/NonMovingScav.c | 11
-rw-r--r--   rts/sm/Storage.c       | 19
5 files changed, 60 insertions, 3 deletions
diff --git a/rts/sm/NonMoving.c b/rts/sm/NonMoving.c
index 41510e7f8e..e774846dbb 100644
--- a/rts/sm/NonMoving.c
+++ b/rts/sm/NonMoving.c
@@ -492,6 +492,24 @@ Mutex concurrent_coll_finished_lock;
  * remembered set during the preparatory GC. This allows us to safely skip the
  * non-moving write barrier without jeopardizing the snapshot invariant.
  *
+ *
+ * Note [Allocating pinned objects into the non-moving heap]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * Under the moving collector small, pinned ByteArray#s are allocated by
+ * Storage.c:allocatePinned() into a per-capability accumulator block which is
+ * filled in a bump-pointer fashion. While this scheme is simple, it can lead
+ * to very poor fragmentation behavior as objects become unreachable: a single
+ * live ByteArray# can keep an entire block of memory alive.
+ *
+ * When the non-moving collector is in use we can do better by allocating small
+ * pinned objects directly into the non-moving heap.
+ *
+ * One wrinkle here is that pinned ByteArrays may have alignment requirements
+ * which requires that we insert padding zero-words before the beginning of the
+ * object. We must be certain to account for this padding when inspecting the
+ * object.
+ *
  */
 
 memcount nonmoving_live_words = 0;
@@ -660,8 +678,8 @@ void *nonmovingAllocate(Capability *cap, StgWord sz)
     unsigned int log_block_size = log2_ceil(sz * sizeof(StgWord));
     unsigned int block_count = nonmovingBlockCountFromSize(log_block_size);
 
-    // The max we ever allocate is 3276 bytes (anything larger is a large
-    // object and not moved) which is covered by allocator 9.
+    // The max we ever allocate is NONMOVING_MAX_BLOCK_SZ bytes (anything
+    // larger is a large object and not moved) which is covered by allocator 9.
     ASSERT(log_block_size < NONMOVING_ALLOCA0 + NONMOVING_ALLOCA_CNT);
 
     struct NonmovingAllocator *alloca = nonmovingHeap.allocators[log_block_size - NONMOVING_ALLOCA0];
diff --git a/rts/sm/NonMoving.h b/rts/sm/NonMoving.h
index 12fb9ddaab..0f7860f44c 100644
--- a/rts/sm/NonMoving.h
+++ b/rts/sm/NonMoving.h
@@ -92,11 +92,17 @@ struct NonmovingAllocator {
 
 // allocators cover block sizes of 2^NONMOVING_ALLOCA0 to
 // 2^(NONMOVING_ALLOCA0 + NONMOVING_ALLOCA_CNT) (in bytes)
+// The largest allocator class must be at least LARGE_OBJECT_THRESHOLD in size
+// as Storage.c:allocatePinned will allocate small pinned allocations into the
+// non-moving heap.
 #define NONMOVING_ALLOCA_CNT 12
 
 // maximum number of free segments to hold on to
 #define NONMOVING_MAX_FREE 16
 
+// block size of largest allocator in bytes.
+#define NONMOVING_MAX_BLOCK_SZ (1 << (NONMOVING_ALLOCA0 + NONMOVING_ALLOCA_CNT - 1))
+
 struct NonmovingHeap {
     struct NonmovingAllocator *allocators[NONMOVING_ALLOCA_CNT];
     // free segment list. This is a cache where we keep up to
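As a point of reference for the new bound, the old and new comments can be reconciled with a few lines of standalone C. The sketch below is illustrative only and is not part of the patch; it assumes NONMOVING_ALLOCA0 is 3, which the removed "3276 bytes ... covered by allocator 9" comment implies (log2_ceil(3276) is 12 and 12 - 9 = 3), and it supplies its own stub in place of the RTS's log2_ceil().

#include <assert.h>
#include <stdio.h>

#define NONMOVING_ALLOCA0    3   /* assumed: log2 of the smallest block size */
#define NONMOVING_ALLOCA_CNT 12
#define NONMOVING_MAX_BLOCK_SZ (1 << (NONMOVING_ALLOCA0 + NONMOVING_ALLOCA_CNT - 1))

/* stand-in for the RTS's log2_ceil(): smallest n with 2^n >= x */
static unsigned int log2_ceil(unsigned long x)
{
    unsigned int n = 0;
    while ((1UL << n) < x)
        n++;
    return n;
}

int main(void)
{
    /* A 3276-byte pinned request (the old hard-coded maximum) selects
     * allocator class log2_ceil(3276) - NONMOVING_ALLOCA0 = 12 - 3 = 9,
     * matching the comment removed from nonmovingAllocate(). */
    assert(log2_ceil(3276) - NONMOVING_ALLOCA0 == 9);

    /* The largest class covers 2^(3 + 12 - 1) = 16384-byte blocks, so any
     * allocation below LARGE_OBJECT_THRESHOLD fits in some class. */
    printf("NONMOVING_MAX_BLOCK_SZ = %d bytes\n", NONMOVING_MAX_BLOCK_SZ);
    return 0;
}

Under those assumptions the largest allocator class covers 16 KiB blocks, comfortably above LARGE_OBJECT_THRESHOLD (roughly 3276 bytes, i.e. 8/10 of a 4 KiB block), which is exactly what the new comment in NonMoving.h requires.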
diff --git a/rts/sm/NonMovingMark.c b/rts/sm/NonMovingMark.c
index d9758b943f..c5c88bba43 100644
--- a/rts/sm/NonMovingMark.c
+++ b/rts/sm/NonMovingMark.c
@@ -1380,6 +1380,11 @@ mark_closure (MarkQueue *queue, const StgClosure *p0, StgClosure **origin)
     // Trace pointers
     /////////////////////////////////////////////////////
 
+    // Find beginning of object.
+    // See Note [Allocating pinned objects into the non-moving heap].
+    while (*(StgPtr*) p == NULL)
+        p = (StgClosure *) ((StgPtr*) p + 1);
+
     const StgInfoTable *info = get_itbl(p);
     switch (info->type) {
diff --git a/rts/sm/NonMovingScav.c b/rts/sm/NonMovingScav.c
index 9f92563032..8128f0dba3 100644
--- a/rts/sm/NonMovingScav.c
+++ b/rts/sm/NonMovingScav.c
@@ -84,9 +84,18 @@
  */
 void
-nonmovingScavengeOne (StgClosure *q)
+nonmovingScavengeOne (StgClosure *q0)
 {
+    StgClosure *q = q0;
+
+    // N.B. There may be a gap before the first word of the closure in the case
+    // of an aligned ByteArray# as allocated by allocatePinned().
+    // See Note [Allocating pinned objects into the non-moving heap].
+    while (*(StgPtr*) q == NULL)
+        q = (StgClosure *) ((StgPtr*) q + 1);
+    ASSERT(LOOKS_LIKE_CLOSURE_PTR(q));
+
     StgPtr p = (StgPtr)q;
     const StgInfoTable *info = get_itbl(q);
     const bool saved_eager_promotion = gct->eager_promotion;
diff --git a/rts/sm/Storage.c b/rts/sm/Storage.c
index 40353ea180..a05e43721e 100644
--- a/rts/sm/Storage.c
+++ b/rts/sm/Storage.c
@@ -1248,6 +1248,25 @@ allocatePinned (Capability *cap, W_ n /*words*/, W_ alignment /*bytes*/, W_ alig
 
     const StgWord alignment_w = alignment / sizeof(W_);
 
+    // If the non-moving collector is enabled then we can allocate small,
+    // pinned allocations directly into the non-moving heap. This is a bit more
+    // expensive up-front but reduces fragmentation and is worthwhile since
+    // pinned allocations are often long-lived..
+    //
+    // See Note [Allocating pinned objects into the non-moving heap].
+    if (RTS_UNLIKELY(RtsFlags.GcFlags.useNonmoving)
+        && (n + alignment_w) * sizeof(W_) < NONMOVING_MAX_BLOCK_SZ)
+    {
+        ACQUIRE_SM_LOCK;
+        p = nonmovingAllocate(cap, n + alignment_w);
+        RELEASE_SM_LOCK;
+        W_ off_w = ALIGN_WITH_OFF_W(p, alignment, align_off);
+        MEMSET_SLOP_W(p, 0, off_w);
+        p += off_w;
+        MEMSET_SLOP_W(p + n, 0, alignment_w - off_w - 1);
+        return p;
+    }
+
     // If the request is for a large object, then allocate()
     // will give us a pinned object anyway.
     if (n >= LARGE_OBJECT_THRESHOLD/sizeof(W_)) {
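To make the cooperation between the new allocatePinned() path and the skip loops added to mark_closure() and nonmovingScavengeOne() concrete, here is a minimal, self-contained model of the padding scheme. It is a sketch under simplifying assumptions, not RTS code: malloc stands in for nonmovingAllocate(), the hypothetical pad_words() stands in for ALIGN_WITH_OFF_W, and the align_off parameter is ignored.

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

typedef uintptr_t W_;              /* one heap word, as in the RTS */
#define WORD_SIZE sizeof(W_)

/* Hypothetical stand-in for ALIGN_WITH_OFF_W: number of words of zero padding
 * needed so that p + (result) is 'alignment'-byte aligned (alignment must be
 * a power of two). */
static size_t pad_words(const W_ *p, size_t alignment)
{
    uintptr_t addr = (uintptr_t)p;
    uintptr_t aligned = (addr + alignment - 1) & ~(uintptr_t)(alignment - 1);
    return (aligned - addr) / WORD_SIZE;
}

int main(void)
{
    size_t n = 4;                              /* object size in words        */
    size_t alignment = 32;                     /* requested alignment (bytes) */
    size_t alignment_w = alignment / WORD_SIZE;

    /* The patched allocatePinned() asks the non-moving allocator for
     * n + alignment_w words so that padding can be carved off the front. */
    W_ *block = malloc((n + alignment_w) * WORD_SIZE);
    assert(block != NULL);

    size_t off_w = pad_words(block, alignment);
    memset(block, 0, off_w * WORD_SIZE);       /* zero-word pre-padding       */
    W_ *obj = block + off_w;                   /* aligned start of the object */
    obj[0] = 0xdeadbeef;                       /* stands in for the info pointer */
    memset(obj + n, 0, (alignment_w - off_w) * WORD_SIZE);  /* trailing slop  */

    /* Consumer side: mark_closure()/nonmovingScavengeOne() recover the real
     * closure by skipping the leading zero words, as in the patch above. */
    W_ *scan = block;
    while (*scan == 0)
        scan++;
    assert(scan == obj);

    free(block);
    return 0;
}

The invariant the sketch relies on is the one stated in the Note: only zero words may precede the object, so a consumer that lands on the start of the allocation can always recover the real closure pointer by skipping NULLs.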