author    Matthew Pickering <matthewtpickering@gmail.com>  2021-03-04 10:16:07 +0000
committer Matthew Pickering <matthewtpickering@gmail.com>  2021-03-07 19:31:38 +0000
commit    7d4473b1c0887b18ee6424ac97fbe3e7afa69f66 (patch)
tree      93b802d3ed1324dfc1b0ec456034bbf9164f3584
parent    cf65cf16c89414273c4f6b2d090d4b2fffb90759 (diff)
download  haskell-wip/ghc-allocate-pinned.tar.gz
rts: Use a separate free block list for allocatePinned (wip/ghc-allocate-pinned)
The way in which allocatePinned took blocks out of the nursery was
leading to horrible fragmentation in some workloads. The strategy now is
that a separate free block list is reserved for each capability and
blocks are taken from there. When it's empty, the global SM lock is
taken and a fresh chain of PINNED_EMPTY_SIZE blocks is allocated.

Fixes #19481
-rw-r--r--  rts/Capability.c                 1
-rw-r--r--  rts/Capability.h                 2
-rw-r--r--  rts/sm/Sanity.c                  7
-rw-r--r--  rts/sm/Storage.c               162
-rw-r--r--  testsuite/tests/rts/T19481.hs   56
-rw-r--r--  testsuite/tests/rts/all.T        2
6 files changed, 215 insertions, 15 deletions
diff --git a/rts/Capability.c b/rts/Capability.c
index 136a62a71e..7a83821e00 100644
--- a/rts/Capability.c
+++ b/rts/Capability.c
@@ -314,6 +314,7 @@ initCapability (Capability *cap, uint32_t i)
cap->interrupt = 0;
cap->pinned_object_block = NULL;
cap->pinned_object_blocks = NULL;
+ cap->pinned_object_empty = NULL;
#if defined(PROFILING)
cap->r.rCCCS = CCS_SYSTEM;
diff --git a/rts/Capability.h b/rts/Capability.h
index df486829ea..14ba9ef2d7 100644
--- a/rts/Capability.h
+++ b/rts/Capability.h
@@ -93,6 +93,8 @@ struct Capability_ {
bdescr *pinned_object_block;
// full pinned object blocks allocated since the last GC
bdescr *pinned_object_blocks;
+ // empty pinned object blocks, to be allocated into
+ bdescr *pinned_object_empty;
// per-capability weak pointer list associated with nursery (older
// lists stored in generation object)
diff --git a/rts/sm/Sanity.c b/rts/sm/Sanity.c
index 193a1a884c..b39559a653 100644
--- a/rts/sm/Sanity.c
+++ b/rts/sm/Sanity.c
@@ -1185,7 +1185,7 @@ memInventory (bool show)
{
uint32_t g, i;
W_ gen_blocks[RtsFlags.GcFlags.generations];
- W_ nursery_blocks = 0, retainer_blocks = 0,
+ W_ nursery_blocks = 0, free_pinned_blocks = 0, retainer_blocks = 0,
arena_blocks = 0, exec_blocks = 0, gc_free_blocks = 0,
upd_rem_set_blocks = 0;
W_ live_blocks = 0, free_blocks = 0;
@@ -1223,6 +1223,7 @@ memInventory (bool show)
nursery_blocks += capabilities[i]->pinned_object_block->blocks;
}
nursery_blocks += countBlocks(capabilities[i]->pinned_object_blocks);
+ free_pinned_blocks += countBlocks(capabilities[i]->pinned_object_empty);
}
#if defined(PROFILING)
@@ -1252,7 +1253,7 @@ memInventory (bool show)
}
live_blocks += nursery_blocks +
+ retainer_blocks + arena_blocks + exec_blocks + gc_free_blocks
- + upd_rem_set_blocks;
+ + upd_rem_set_blocks + free_pinned_blocks;
#define MB(n) (((double)(n) * BLOCK_SIZE_W) / ((1024*1024)/sizeof(W_)))
@@ -1271,6 +1272,8 @@ memInventory (bool show)
}
debugBelch(" nursery : %5" FMT_Word " blocks (%6.1lf MB)\n",
nursery_blocks, MB(nursery_blocks));
+ debugBelch(" empty pinned : %5" FMT_Word " blocks (%6.1lf MB)\n",
+ free_pinned_blocks, MB(free_pinned_blocks));
debugBelch(" retainer : %5" FMT_Word " blocks (%6.1lf MB)\n",
retainer_blocks, MB(retainer_blocks));
debugBelch(" arena blocks : %5" FMT_Word " blocks (%6.1lf MB)\n",
diff --git a/rts/sm/Storage.c b/rts/sm/Storage.c
index 2bab2d6432..82e959e8d2 100644
--- a/rts/sm/Storage.c
+++ b/rts/sm/Storage.c
@@ -67,6 +67,16 @@ generation *oldest_gen = NULL; /* oldest generation, for convenience */
nursery *nurseries = NULL;
uint32_t n_nurseries;
+/* Pinned Nursery Size: the number of blocks that we reserve for
+ * pinned data. The number chosen here decides whether the pinned block
+ * list is refilled from the free list (if n < BLOCKS_PER_MBLOCK) or
+ * whether a fresh mblock is allocated each time.
+ * See Note [Sources of Block Level Fragmentation]
+ */
+
+#define PINNED_EMPTY_SIZE BLOCKS_PER_MBLOCK
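+
+/* For example, with the default 1 MB megablocks and 4 KB blocks,
+ * BLOCKS_PER_MBLOCK is 252, so each refill requests roughly a whole
+ * megablock's worth of blocks. (Sizes are the usual defaults, quoted
+ * here for illustration rather than taken from this patch.) */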
+
+
/*
* When we are using nursery chunks, we need a separate next_nursery
* pointer for each NUMA node.
@@ -353,6 +363,7 @@ void listAllBlocks (ListBlocksCb cb, void *user)
cb(user, capabilities[i]->pinned_object_block);
}
cb(user, capabilities[i]->pinned_object_blocks);
+ cb(user, capabilities[i]->pinned_object_empty);
}
}
@@ -1257,25 +1268,47 @@ allocatePinned (Capability *cap, W_ n /*words*/, W_ alignment /*bytes*/, W_ alig
// avoid that (benchmarks that allocate a lot of pinned
// objects scale really badly if we do this).
//
- // So first, we try taking the next block from the nursery, in
- // the same way as allocate().
- bd = cap->r.rCurrentNursery->link;
+ // See Note [Sources of Block Level Fragmentation]
+ // for a more complete history of this section.
+ bd = cap->pinned_object_empty;
if (bd == NULL) {
- // The nursery is empty: allocate a fresh block (we can't fail
+ // The pinned block list is empty: allocate a fresh chain of blocks (we can't fail
// here).
ACQUIRE_SM_LOCK;
- bd = allocBlockOnNode(cap->node);
+ bd = allocNursery(cap->node, NULL, PINNED_EMPTY_SIZE);
RELEASE_SM_LOCK;
- initBdescr(bd, g0, g0);
- } else {
- newNurseryBlock(bd);
- // we have a block in the nursery: steal it
- cap->r.rCurrentNursery->link = bd->link;
- if (bd->link != NULL) {
- bd->link->u.back = cap->r.rCurrentNursery;
+ }
+
+ // Bump up the nursery pointer to avoid the pathological situation
+ // where a program is *only* allocating pinned objects.
+ // T4018 fails without this safety.
+ // This has the effect of counting a full pinned block in the same way
+ // as a full nursery block, so GCs will be triggered at the same interval
+ // if you are only allocating pinned data compared to normal allocations
+ // via allocate().
+ bdescr * nbd;
+ nbd = cap->r.rCurrentNursery->link;
+ if (nbd != NULL){
+ newNurseryBlock(nbd);
+ cap->r.rCurrentNursery->link = nbd->link;
+ if (nbd->link != NULL) {
+ nbd->link->u.back = cap->r.rCurrentNursery;
}
- cap->r.rNursery->n_blocks -= bd->blocks;
+ dbl_link_onto(nbd, &cap->r.rNursery->blocks);
+ // Important for accounting purposes
+ if (cap->r.rCurrentAlloc){
+ finishedNurseryBlock(cap, cap->r.rCurrentAlloc);
+ }
+ cap->r.rCurrentAlloc = nbd;
+ }
+
+
+ cap->pinned_object_empty = bd->link;
+ newNurseryBlock(bd);
+ if (bd->link != NULL) {
+ bd->link->u.back = cap->pinned_object_empty;
}
+ initBdescr(bd, g0, g0);
cap->pinned_object_block = bd;
bd->flags = BF_PINNED | BF_LARGE | BF_EVACUATED;
@@ -1912,3 +1945,106 @@ _bdescr (StgPtr p)
}
#endif
+
+/*
+Note [Sources of Block Level Fragmentation]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Block level fragmentation is when there is unused space in megablocks.
+The amount of fragmentation can be calculated as the difference between the
+total size of allocated blocks and the total size of allocated megablocks.
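+
+As a rough illustration of that calculation (a sketch, not code from
+this patch; it ignores the space each megablock reserves for block
+descriptors):
+
+    // words lost to block-level fragmentation
+    W_ fragmentation_words(W_ n_alloc_blocks, W_ n_mblocks)
+    {
+        // each megablock holds up to BLOCKS_PER_MBLOCK blocks
+        return (n_mblocks * BLOCKS_PER_MBLOCK - n_alloc_blocks)
+                 * BLOCK_SIZE_W;
+    }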
+
+The act of copying collection naturally reduces fragmentation by moving
+data between megablocks: over time most megablocks end up quite full,
+because data is copied out of the fragmented ones. The destination block
+is chosen from the free list, aiming for a gap of approximately the
+right size for the copied data, so the data tends to end up in a less
+fragmented megablock. There are two situations in which we end up with
+block fragmentation:
+
+1. Fragmentation from pinned data
+2. Fragmentation from nursery allocated blocks
+
+# Pinned Data Fragmentation
+
+There are two sources of pinned data: large objects and pinned
+bytearrays. Once such an object has been allocated it is never moved by
+the collector, so if all the other blocks around it are collected you
+can end up with a megablock containing one pinned block and no other
+blocks. The compiler makes no special effort to prevent this kind of
+fragmentation from arising in the first place, and once the heap is
+fragmented in this way there is nothing to be done beyond hoping that
+the pinned data is eventually freed.
+
+# Nursery Fragmentation
+
+The other reason a block may never be moved or emptied is that it forms
+part of the nursery. When the nursery is first allocated it is made up
+of megablock-sized chunks, so a 4 megabyte nursery will consist of
+blocks from about 4 megablocks.
+
+Over time the nursery is resized (by resizeNurseries) under various
+conditions: it grows when we are allocating more and shrinks when we
+are allocating less. When the nursery is resized, blocks are added to
+or removed from it in groups potentially smaller than a complete
+megablock. For example, if the nursery needs to grow by one block, the
+free list is consulted for a group of size 1 (from an arbitrary
+megablock) and that is added to the nursery, as in the sketch below.
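+
+A rough illustration of that growth path (shape only, with a made-up
+`desired_blocks`; the real code lives in resizeNurseries/allocNursery):
+
+    // any free block from any megablock will do, so successive refills
+    // scatter the nursery across many megablocks
+    while (nursery->n_blocks < desired_blocks) {
+        bdescr *bd = allocBlock();
+        bd->link = nursery->blocks;
+        nursery->blocks = bd;
+        nursery->n_blocks++;
+    }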
+
+Over time the make-up of the nursery changes from 4
+contiguous megablocks to a hodge-podge of blocks from different megablocks. In
+some programs (see #19481), the fragmentation is so bad that a program with
+only 4 MB of live data can retain over 500 megablocks because each of these
+megablocks contributed a small number of blocks to the nursery.
+
+In particular, and confusingly, this second form of fragmentation was
+caused by the act of allocating pinned objects. `allocatePinned` was the
+primary reason that the nursery size decreased by small amounts. When
+`allocatePinned` needed a block it took one permanently out of the
+nursery, shrinking the nursery by 1 block. The next time the nursery
+size was checked, `allocNurseries` found that the existing nursery was
+smaller than the desired size and a new block needed to be added. This
+allocation was serviced from an arbitrary megablock which had some free
+space. The effect over time, as more allocation happened, was that the
+nursery became made up of blocks from many different megablocks.
+
+Instead, we now maintain a separate small list of blocks in
+`pinned_object_empty`, from which fresh blocks are taken when one is
+needed for a pinned allocation, rather than threatening the continuity
+of the nursery. The size of this list is controlled by the
+PINNED_EMPTY_SIZE macro.
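+
+Condensed from allocatePinned above (a sketch of the shape; the real
+code also bumps the nursery pointer for accounting):
+
+    bd = cap->pinned_object_empty;
+    if (bd == NULL) {
+        // the list ran dry: refill it wholesale under the SM lock
+        ACQUIRE_SM_LOCK;
+        bd = allocNursery(cap->node, NULL, PINNED_EMPTY_SIZE);
+        RELEASE_SM_LOCK;
+    }
+    // unlink the head block and allocate pinned objects into it
+    cap->pinned_object_empty = bd->link;
+    cap->pinned_object_block = bd;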
+
+In theory, this kind of fragmentation due to the nursery could still happen
+but in practice removing the primary cause (allocatePinned) was sufficient to
+greatly improve the situation. Another way to "fix" fragmentation of the nursery
+would be to periodically reallocate it when it was fragmented across many megablocks.
+
+Ticket: #19481
+
+# When can fragmentation be observed?
+
+Fragmentation is observed when the live data in a program is low compared to
+the overall resident size of the heap. The block allocator can reuse unused
+space within a megablock and therefore as residency
+increases again, the fragmented blocks will get filled up. Having a block-level
+fragmented heap means your program will never go below a certain memory
+threshold but it doesn't "use" more memory during periods of high residency.
+To clarify: say you observe 100 MB of fragmentation when your live data
+is 4 MB. If your live data rises to 200 MB then you will probably no
+longer observe 100 MB of fragmentation, as the block allocator will
+reuse the space in the fragmented megablocks.
+
+# How to observe fragmentation
+
+Your heap is probably fragmented when the following hold (a rough
+programmatic check is sketched after this list):
+
+* Live bytes is low
+* Memory in use (number of megablocks) is comparatively high
+* The size of the free list dominates residency (this can be observed using the
+ debug RTS and the memory inventory produced by -Dg).
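+
+A rough programmatic check using the public stats API in RtsAPI.h (the
+10x ratio is an arbitrary illustrative threshold, not an RTS constant):
+
+    RTSStats s;
+    getRTSStats(&s);
+    if (s.gc.live_bytes * 10 < s.gc.mem_in_use_bytes) {
+        // most retained memory is not live data: suspect fragmentation
+        // (or a large free list); inspect further with -Dg
+    }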
+
+# Compacting Collector
+
+The compacting collector does nothing to improve megablock-level
+fragmentation. The role of the compacting GC is to remove object-level
+fragmentation and to use less memory when collecting; see #19248.
+*/
diff --git a/testsuite/tests/rts/T19481.hs b/testsuite/tests/rts/T19481.hs
new file mode 100644
index 0000000000..bd3ed6895f
--- /dev/null
+++ b/testsuite/tests/rts/T19481.hs
@@ -0,0 +1,56 @@
+{-# LANGUAGE UnboxedTuples #-}
+{-# LANGUAGE MagicHash #-}
+module Main where
+
+import GHC.Exts
+import GHC.IO
+import GHC.Stats
+import System.Mem
+import Control.Monad
+
+data BA = BA ByteArray#
+
+-- TODO: This shouldn't be hardcoded but MBLOCK_SIZE isn't exported by
+-- any RTS header I could find.
+mblockSize = 2 ^ 20
+
+main = do
+ -- Increasing this number increases the amount of fragmentation (but not
+ -- linearly)
+ ba <- replicateM 500 one
+ replicateM 100 performMajorGC
+ s <- getRTSStats
+ let mblocks = (gcdetails_mem_in_use_bytes (gc s) `div` mblockSize)
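+ -- Without the fix, fragmentation keeps hundreds of megablocks alive
+ -- (see #19481); 15 is a generous bound for the fixed RTS.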
+ if mblocks < 15
+ then return ()
+ else error ("Heap is fragmented: " ++ show mblocks)
+ return ()
+
+one = do
+ ba <- mkBlock
+ bs <- mapM isP ba
+ return ()
+
+
+isP (BA ba) = IO $ \s0 -> (# s0, isTrue# (isByteArrayPinned# ba) #)
+
+mkN 0 = return []
+mkN k = (:) <$> mkBA <*> mkN (k - 1)
+
+-- Mixture of pinned and unpinned allocation so that allocatePinned takes
+-- some pinned blocks from the nursery.
+mkBlock = (++) <$> replicateM 100 mkBAP <*> replicateM 10000 mkBA
+
+mkBAP =
+ IO $ \s0 ->
+ -- 1024 is below large object threshold but fills up a block quickly
+ case newPinnedByteArray# 1024# s0 of
+ (# s1, mba #) -> case unsafeFreezeByteArray# mba s1 of
+ (# s2, ba #) -> (# s2, BA ba #)
+
+mkBA =
+ IO $ \s0 ->
+ -- 1024 is below large object threshold but fills up a block quickly
+ case newByteArray# 1024# s0 of
+ (# s1, mba #) -> case unsafeFreezeByteArray# mba s1 of
+ (# s2, ba #) -> (# s2, BA ba #)
diff --git a/testsuite/tests/rts/all.T b/testsuite/tests/rts/all.T
index e74834d2a1..9f2a54cd0f 100644
--- a/testsuite/tests/rts/all.T
+++ b/testsuite/tests/rts/all.T
@@ -423,3 +423,5 @@ test('T17088',
compile_and_run, ['-rtsopts -O2'])
test('T15427', normal, compile_and_run, [''])
+
+test('T19481', extra_run_opts('+RTS -T -RTS'), compile_and_run, [''])