diff options
author | Simon Marlow <marlowsd@gmail.com> | 2016-04-23 21:14:49 +0100 |
---|---|---|
committer | Simon Marlow <marlowsd@gmail.com> | 2016-06-10 21:25:54 +0100 |
commit | 9e5ea67e268be2659cd30ebaed7044d298198ab0 (patch) | |
tree | c395e74ee772ae0d59c852b3cbde743784b08d09 /rts/sm | |
parent | b9fa72a24ba2cc3120912e6afedc9280d28d2077 (diff) | |
download | haskell-9e5ea67e268be2659cd30ebaed7044d298198ab0.tar.gz |
NUMA support
Summary:
The aim here is to reduce the number of remote memory accesses on
systems with a NUMA memory architecture, typically multi-socket servers.
Linux provides a NUMA API for doing two things:
* Allocating memory local to a particular node
* Binding a thread to a particular node
When given the +RTS --numa flag, the runtime will
* Determine the number of NUMA nodes (N) by querying the OS
* Assign capabilities to nodes, so cap C is on node C%N
* Bind worker threads on a capability to the correct node
* Keep a separate free lists in the block layer for each node
* Allocate the nursery for a capability from node-local memory
* Allocate blocks in the GC from node-local memory
For example, using nofib/parallel/queens on a 24-core 2-socket machine:
```
$ ./Main 15 +RTS -N24 -s -A64m
Total time 173.960s ( 7.467s elapsed)
$ ./Main 15 +RTS -N24 -s -A64m --numa
Total time 150.836s ( 6.423s elapsed)
```
The biggest win here is expected to be allocating from node-local
memory, so that means programs using a large -A value (as here).
According to perf, on this program the number of remote memory accesses
were reduced by more than 50% by using `--numa`.
Test Plan:
* validate
* There's a new flag --debug-numa=<n> that pretends to do NUMA without
actually making the OS calls, which is useful for testing the code
on non-NUMA systems.
* TODO: I need to add some unit tests
Reviewers: erikd, austin, rwbarton, ezyang, bgamari, hvr, niteria
Subscribers: thomie
Differential Revision: https://phabricator.haskell.org/D2199
Diffstat (limited to 'rts/sm')
-rw-r--r-- | rts/sm/BlockAlloc.c | 350 | ||||
-rw-r--r-- | rts/sm/BlockAlloc.h | 1 | ||||
-rw-r--r-- | rts/sm/GC.c | 8 | ||||
-rw-r--r-- | rts/sm/GCUtils.c | 15 | ||||
-rw-r--r-- | rts/sm/GCUtils.h | 14 | ||||
-rw-r--r-- | rts/sm/MBlock.c | 19 | ||||
-rw-r--r-- | rts/sm/MarkStack.h | 1 | ||||
-rw-r--r-- | rts/sm/OSMem.h | 4 | ||||
-rw-r--r-- | rts/sm/Storage.c | 232 |
9 files changed, 398 insertions, 246 deletions
diff --git a/rts/sm/BlockAlloc.c b/rts/sm/BlockAlloc.c index ff1a6460a4..c2859b0c15 100644 --- a/rts/sm/BlockAlloc.c +++ b/rts/sm/BlockAlloc.c @@ -7,11 +7,11 @@ * This is the architecture independent part of the block allocator. * It requires only the following support from the operating system: * - * void *getMBlock(uint32_t n); + * void *getMBlocks(uint32_t n); * * returns the address of an n*MBLOCK_SIZE region of memory, aligned on * an MBLOCK_SIZE boundary. There are no other restrictions on the - * addresses of memory returned by getMBlock(). + * addresses of memory returned by getMBlocks(). * * ---------------------------------------------------------------------------*/ @@ -25,7 +25,7 @@ #include <string.h> -static void initMBlock(void *mblock); +static void initMBlock(void *mblock, uint32_t node); /* ----------------------------------------------------------------------------- @@ -158,28 +158,55 @@ static void initMBlock(void *mblock); // In THREADED_RTS mode, the free list is protected by sm_mutex. -static bdescr *free_list[NUM_FREE_LISTS]; -static bdescr *free_mblock_list; +static bdescr *free_list[MAX_NUMA_NODES][NUM_FREE_LISTS]; +static bdescr *free_mblock_list[MAX_NUMA_NODES]; W_ n_alloc_blocks; // currently allocated blocks W_ hw_alloc_blocks; // high-water allocated blocks +W_ n_alloc_blocks_by_node[MAX_NUMA_NODES]; + /* ----------------------------------------------------------------------------- Initialisation -------------------------------------------------------------------------- */ void initBlockAllocator(void) { - uint32_t i; - for (i=0; i < NUM_FREE_LISTS; i++) { - free_list[i] = NULL; + uint32_t i, node; + for (node = 0; node < MAX_NUMA_NODES; node++) { + for (i=0; i < NUM_FREE_LISTS; i++) { + free_list[node][i] = NULL; + } + free_mblock_list[node] = NULL; + n_alloc_blocks_by_node[node] = 0; } - free_mblock_list = NULL; n_alloc_blocks = 0; hw_alloc_blocks = 0; } /* ----------------------------------------------------------------------------- + Accounting + -------------------------------------------------------------------------- */ + +STATIC_INLINE +void recordAllocatedBlocks(uint32_t node, uint32_t n) +{ + n_alloc_blocks += n; + n_alloc_blocks_by_node[node] += n; + if (n > 0 && n_alloc_blocks > hw_alloc_blocks) { + hw_alloc_blocks = n_alloc_blocks; + } +} + +STATIC_INLINE +void recordFreedBlocks(uint32_t node, uint32_t n) +{ + ASSERT(n_alloc_blocks >= n); + n_alloc_blocks -= n; + n_alloc_blocks_by_node[node] -= n; +} + +/* ----------------------------------------------------------------------------- Allocation -------------------------------------------------------------------------- */ @@ -248,14 +275,14 @@ log_2_ceil(W_ n) } STATIC_INLINE void -free_list_insert (bdescr *bd) +free_list_insert (uint32_t node, bdescr *bd) { uint32_t ln; ASSERT(bd->blocks < BLOCKS_PER_MBLOCK); ln = log_2(bd->blocks); - dbl_link_onto(bd, &free_list[ln]); + dbl_link_onto(bd, &free_list[node][ln]); } @@ -284,18 +311,18 @@ setup_tail (bdescr *bd) // Take a free block group bd, and split off a group of size n from // it. Adjust the free list as necessary, and return the new group. static bdescr * -split_free_block (bdescr *bd, W_ n, uint32_t ln) +split_free_block (bdescr *bd, uint32_t node, W_ n, uint32_t ln) { bdescr *fg; // free group ASSERT(bd->blocks > n); - dbl_link_remove(bd, &free_list[ln]); + dbl_link_remove(bd, &free_list[node][ln]); fg = bd + bd->blocks - n; // take n blocks off the end fg->blocks = n; bd->blocks -= n; setup_tail(bd); ln = log_2(bd->blocks); - dbl_link_onto(bd, &free_list[ln]); + dbl_link_onto(bd, &free_list[node][ln]); return fg; } @@ -304,7 +331,7 @@ split_free_block (bdescr *bd, W_ n, uint32_t ln) * initGroup afterwards. */ static bdescr * -alloc_mega_group (StgWord mblocks) +alloc_mega_group (uint32_t node, StgWord mblocks) { bdescr *best, *bd, *prev; StgWord n; @@ -313,14 +340,14 @@ alloc_mega_group (StgWord mblocks) best = NULL; prev = NULL; - for (bd = free_mblock_list; bd != NULL; prev = bd, bd = bd->link) + for (bd = free_mblock_list[node]; bd != NULL; prev = bd, bd = bd->link) { if (bd->blocks == n) { if (prev) { prev->link = bd->link; } else { - free_mblock_list = bd->link; + free_mblock_list[node] = bd->link; } return bd; } @@ -341,12 +368,17 @@ alloc_mega_group (StgWord mblocks) (best_mblocks-mblocks)*MBLOCK_SIZE); best->blocks = MBLOCK_GROUP_BLOCKS(best_mblocks - mblocks); - initMBlock(MBLOCK_ROUND_DOWN(bd)); + initMBlock(MBLOCK_ROUND_DOWN(bd), node); } else { - void *mblock = getMBlocks(mblocks); - initMBlock(mblock); // only need to init the 1st one + void *mblock; + if (RtsFlags.GcFlags.numa) { + mblock = getMBlocksOnNode(node, mblocks); + } else { + mblock = getMBlocks(mblocks); + } + initMBlock(mblock, node); // only need to init the 1st one bd = FIRST_BDESCR(mblock); } bd->blocks = MBLOCK_GROUP_BLOCKS(mblocks); @@ -354,7 +386,7 @@ alloc_mega_group (StgWord mblocks) } bdescr * -allocGroup (W_ n) +allocGroupOnNode (uint32_t node, W_ n) { bdescr *bd, *rem; StgWord ln; @@ -369,21 +401,19 @@ allocGroup (W_ n) // n_alloc_blocks doesn't count the extra blocks we get in a // megablock group. - n_alloc_blocks += mblocks * BLOCKS_PER_MBLOCK; - if (n_alloc_blocks > hw_alloc_blocks) hw_alloc_blocks = n_alloc_blocks; + recordAllocatedBlocks(node, mblocks * BLOCKS_PER_MBLOCK); - bd = alloc_mega_group(mblocks); + bd = alloc_mega_group(node, mblocks); // only the bdescrs of the first MB are required to be initialised initGroup(bd); goto finish; } - n_alloc_blocks += n; - if (n_alloc_blocks > hw_alloc_blocks) hw_alloc_blocks = n_alloc_blocks; + recordAllocatedBlocks(node, n); ln = log_2_ceil(n); - while (ln < NUM_FREE_LISTS && free_list[ln] == NULL) { + while (ln < NUM_FREE_LISTS && free_list[node][ln] == NULL) { ln++; } @@ -397,27 +427,27 @@ allocGroup (W_ n) } #endif - bd = alloc_mega_group(1); + bd = alloc_mega_group(node,1); bd->blocks = n; initGroup(bd); // we know the group will fit rem = bd + n; rem->blocks = BLOCKS_PER_MBLOCK-n; - initGroup(rem); // init the slop - n_alloc_blocks += rem->blocks; + initGroup(rem); // init the slop + recordAllocatedBlocks(node,rem->blocks); freeGroup(rem); // add the slop on to the free list goto finish; } - bd = free_list[ln]; + bd = free_list[node][ln]; if (bd->blocks == n) // exactly the right size! { - dbl_link_remove(bd, &free_list[ln]); + dbl_link_remove(bd, &free_list[node][ln]); initGroup(bd); } else if (bd->blocks > n) // block too big... { - bd = split_free_block(bd, n, ln); + bd = split_free_block(bd, node, n, ln); ASSERT(bd->blocks == n); initGroup(bd); } @@ -432,6 +462,26 @@ finish: return bd; } +STATIC_INLINE +uint32_t nodeWithLeastBlocks (void) +{ + uint32_t node = 0, i; + uint32_t min_blocks = n_alloc_blocks_by_node[0]; + for (i = 1; i < RtsFlags.GcFlags.nNumaNodes; i++) { + if (n_alloc_blocks_by_node[i] < min_blocks) { + min_blocks = n_alloc_blocks_by_node[i]; + node = i; + } + } + return node; +} + +bdescr* allocGroup (W_ n) +{ + return allocGroupOnNode(nodeWithLeastBlocks(),n); +} + + // // Allocate a chunk of blocks that is at least min and at most max // blocks in size. This API is used by the nursery allocator that @@ -448,8 +498,7 @@ finish: // fragmentation, but we make sure that we allocate large blocks // preferably if there are any. // -bdescr * -allocLargeChunk (W_ min, W_ max) +bdescr* allocLargeChunkOnNode (uint32_t node, W_ min, W_ max) { bdescr *bd; StgWord ln, lnmax; @@ -461,34 +510,38 @@ allocLargeChunk (W_ min, W_ max) ln = log_2_ceil(min); lnmax = log_2_ceil(max); - while (ln < NUM_FREE_LISTS && ln < lnmax && free_list[ln] == NULL) { + while (ln < NUM_FREE_LISTS && ln < lnmax && free_list[node][ln] == NULL) { ln++; } if (ln == NUM_FREE_LISTS || ln == lnmax) { - return allocGroup(max); + return allocGroupOnNode(node,max); } - bd = free_list[ln]; + bd = free_list[node][ln]; if (bd->blocks <= max) // exactly the right size! { - dbl_link_remove(bd, &free_list[ln]); + dbl_link_remove(bd, &free_list[node][ln]); initGroup(bd); } else // block too big... { - bd = split_free_block(bd, max, ln); + bd = split_free_block(bd, node, max, ln); ASSERT(bd->blocks == max); initGroup(bd); } - n_alloc_blocks += bd->blocks; - if (n_alloc_blocks > hw_alloc_blocks) hw_alloc_blocks = n_alloc_blocks; + recordAllocatedBlocks(node, bd->blocks); IF_DEBUG(sanity, memset(bd->start, 0xaa, bd->blocks * BLOCK_SIZE)); IF_DEBUG(sanity, checkFreeListSanity()); return bd; } +bdescr* allocLargeChunk (W_ min, W_ max) +{ + return allocLargeChunkOnNode(nodeWithLeastBlocks(), min, max); +} + bdescr * allocGroup_lock(W_ n) { @@ -500,17 +553,31 @@ allocGroup_lock(W_ n) } bdescr * -allocBlock(void) +allocBlock_lock(void) { - return allocGroup(1); + bdescr *bd; + ACQUIRE_SM_LOCK; + bd = allocBlock(); + RELEASE_SM_LOCK; + return bd; } bdescr * -allocBlock_lock(void) +allocGroupOnNode_lock(uint32_t node, W_ n) { bdescr *bd; ACQUIRE_SM_LOCK; - bd = allocBlock(); + bd = allocGroupOnNode(node,n); + RELEASE_SM_LOCK; + return bd; +} + +bdescr * +allocBlockOnNode_lock(uint32_t node) +{ + bdescr *bd; + ACQUIRE_SM_LOCK; + bd = allocBlockOnNode(node); RELEASE_SM_LOCK; return bd; } @@ -542,11 +609,13 @@ static void free_mega_group (bdescr *mg) { bdescr *bd, *prev; + uint32_t node; // Find the right place in the free list. free_mblock_list is // sorted by *address*, not by size as the free_list is. prev = NULL; - bd = free_mblock_list; + node = mg->node; + bd = free_mblock_list[node]; while (bd && bd->start < mg->start) { prev = bd; bd = bd->link; @@ -561,8 +630,8 @@ free_mega_group (bdescr *mg) } else { - mg->link = free_mblock_list; - free_mblock_list = mg; + mg->link = free_mblock_list[node]; + free_mblock_list[node] = mg; } // coalesce forwards coalesce_mblocks(mg); @@ -575,12 +644,15 @@ void freeGroup(bdescr *p) { StgWord ln; + uint32_t node; - // Todo: not true in multithreaded GC + // not true in multithreaded GC: // ASSERT_SM_LOCK(); ASSERT(p->free != (P_)-1); + node = p->node; + p->free = (void *)-1; /* indicates that this block is free */ p->gen = NULL; p->gen_no = 0; @@ -597,14 +669,13 @@ freeGroup(bdescr *p) // If this is an mgroup, make sure it has the right number of blocks ASSERT(p->blocks == MBLOCK_GROUP_BLOCKS(mblocks)); - n_alloc_blocks -= mblocks * BLOCKS_PER_MBLOCK; + recordFreedBlocks(node, mblocks * BLOCKS_PER_MBLOCK); free_mega_group(p); return; } - ASSERT(n_alloc_blocks >= p->blocks); - n_alloc_blocks -= p->blocks; + recordFreedBlocks(node, p->blocks); // coalesce forwards { @@ -614,7 +685,7 @@ freeGroup(bdescr *p) { p->blocks += next->blocks; ln = log_2(next->blocks); - dbl_link_remove(next, &free_list[ln]); + dbl_link_remove(next, &free_list[node][ln]); if (p->blocks == BLOCKS_PER_MBLOCK) { free_mega_group(p); @@ -634,7 +705,7 @@ freeGroup(bdescr *p) if (prev->free == (P_)-1) { ln = log_2(prev->blocks); - dbl_link_remove(prev, &free_list[ln]); + dbl_link_remove(prev, &free_list[node][ln]); prev->blocks += p->blocks; if (prev->blocks >= BLOCKS_PER_MBLOCK) { @@ -646,7 +717,7 @@ freeGroup(bdescr *p) } setup_tail(p); - free_list_insert(p); + free_list_insert(node,p); IF_DEBUG(sanity, checkFreeListSanity()); } @@ -679,7 +750,7 @@ freeChain_lock(bdescr *bd) } static void -initMBlock(void *mblock) +initMBlock(void *mblock, uint32_t node) { bdescr *bd; StgWord8 *block; @@ -695,6 +766,7 @@ initMBlock(void *mblock) for (; block <= (StgWord8*)LAST_BLOCK(mblock); bd += 1, block += BLOCK_SIZE) { bd->start = (void*)block; + bd->node = node; } } @@ -734,28 +806,32 @@ countAllocdBlocks(bdescr *bd) void returnMemoryToOS(uint32_t n /* megablocks */) { - static bdescr *bd; + bdescr *bd; + uint32_t node; StgWord size; - bd = free_mblock_list; - while ((n > 0) && (bd != NULL)) { - size = BLOCKS_TO_MBLOCKS(bd->blocks); - if (size > n) { - StgWord newSize = size - n; - char *freeAddr = MBLOCK_ROUND_DOWN(bd->start); - freeAddr += newSize * MBLOCK_SIZE; - bd->blocks = MBLOCK_GROUP_BLOCKS(newSize); - freeMBlocks(freeAddr, n); - n = 0; - } - else { - char *freeAddr = MBLOCK_ROUND_DOWN(bd->start); - n -= size; - bd = bd->link; - freeMBlocks(freeAddr, size); + // ToDo: not fair, we free all the memory starting with node 0. + for (node = 0; n > 0 && node < RtsFlags.GcFlags.nNumaNodes; node++) { + bd = free_mblock_list[node]; + while ((n > 0) && (bd != NULL)) { + size = BLOCKS_TO_MBLOCKS(bd->blocks); + if (size > n) { + StgWord newSize = size - n; + char *freeAddr = MBLOCK_ROUND_DOWN(bd->start); + freeAddr += newSize * MBLOCK_SIZE; + bd->blocks = MBLOCK_GROUP_BLOCKS(newSize); + freeMBlocks(freeAddr, n); + n = 0; + } + else { + char *freeAddr = MBLOCK_ROUND_DOWN(bd->start); + n -= size; + bd = bd->link; + freeMBlocks(freeAddr, size); + } } + free_mblock_list[node] = bd; } - free_mblock_list = bd; // Ask the OS to release any address space portion // that was associated with the just released MBlocks @@ -797,68 +873,71 @@ checkFreeListSanity(void) { bdescr *bd, *prev; StgWord ln, min; + uint32_t node; - - min = 1; - for (ln = 0; ln < NUM_FREE_LISTS; ln++) { - IF_DEBUG(block_alloc, - debugBelch("free block list [%" FMT_Word "]:\n", ln)); - - prev = NULL; - for (bd = free_list[ln]; bd != NULL; prev = bd, bd = bd->link) - { + for (node = 0; node < RtsFlags.GcFlags.nNumaNodes; node++) { + min = 1; + for (ln = 0; ln < NUM_FREE_LISTS; ln++) { IF_DEBUG(block_alloc, - debugBelch("group at %p, length %ld blocks\n", - bd->start, (long)bd->blocks)); - ASSERT(bd->free == (P_)-1); - ASSERT(bd->blocks > 0 && bd->blocks < BLOCKS_PER_MBLOCK); - ASSERT(bd->blocks >= min && bd->blocks <= (min*2 - 1)); - ASSERT(bd->link != bd); // catch easy loops - - check_tail(bd); - - if (prev) - ASSERT(bd->u.back == prev); - else - ASSERT(bd->u.back == NULL); + debugBelch("free block list [%" FMT_Word "]:\n", ln)); + prev = NULL; + for (bd = free_list[node][ln]; bd != NULL; prev = bd, bd = bd->link) { - bdescr *next; - next = bd + bd->blocks; - if (next <= LAST_BDESCR(MBLOCK_ROUND_DOWN(bd))) + IF_DEBUG(block_alloc, + debugBelch("group at %p, length %ld blocks\n", + bd->start, (long)bd->blocks)); + ASSERT(bd->free == (P_)-1); + ASSERT(bd->blocks > 0 && bd->blocks < BLOCKS_PER_MBLOCK); + ASSERT(bd->blocks >= min && bd->blocks <= (min*2 - 1)); + ASSERT(bd->link != bd); // catch easy loops + ASSERT(bd->node == node); + + check_tail(bd); + + if (prev) + ASSERT(bd->u.back == prev); + else + ASSERT(bd->u.back == NULL); + { - ASSERT(next->free != (P_)-1); + bdescr *next; + next = bd + bd->blocks; + if (next <= LAST_BDESCR(MBLOCK_ROUND_DOWN(bd))) + { + ASSERT(next->free != (P_)-1); + } } } + min = min << 1; } - min = min << 1; - } - prev = NULL; - for (bd = free_mblock_list; bd != NULL; prev = bd, bd = bd->link) - { - IF_DEBUG(block_alloc, - debugBelch("mega group at %p, length %ld blocks\n", - bd->start, (long)bd->blocks)); + prev = NULL; + for (bd = free_mblock_list[node]; bd != NULL; prev = bd, bd = bd->link) + { + IF_DEBUG(block_alloc, + debugBelch("mega group at %p, length %ld blocks\n", + bd->start, (long)bd->blocks)); - ASSERT(bd->link != bd); // catch easy loops + ASSERT(bd->link != bd); // catch easy loops - if (bd->link != NULL) - { - // make sure the list is sorted - ASSERT(bd->start < bd->link->start); - } + if (bd->link != NULL) + { + // make sure the list is sorted + ASSERT(bd->start < bd->link->start); + } - ASSERT(bd->blocks >= BLOCKS_PER_MBLOCK); - ASSERT(MBLOCK_GROUP_BLOCKS(BLOCKS_TO_MBLOCKS(bd->blocks)) - == bd->blocks); + ASSERT(bd->blocks >= BLOCKS_PER_MBLOCK); + ASSERT(MBLOCK_GROUP_BLOCKS(BLOCKS_TO_MBLOCKS(bd->blocks)) + == bd->blocks); - // make sure we're fully coalesced - if (bd->link != NULL) - { - ASSERT(MBLOCK_ROUND_DOWN(bd->link) != - (StgWord8*)MBLOCK_ROUND_DOWN(bd) + - BLOCKS_TO_MBLOCKS(bd->blocks) * MBLOCK_SIZE); + // make sure we're fully coalesced + if (bd->link != NULL) + { + ASSERT(MBLOCK_ROUND_DOWN(bd->link) != + (StgWord8*)MBLOCK_ROUND_DOWN(bd) + + BLOCKS_TO_MBLOCKS(bd->blocks) * MBLOCK_SIZE); + } } } } @@ -869,18 +948,21 @@ countFreeList(void) bdescr *bd; W_ total_blocks = 0; StgWord ln; + uint32_t node; - for (ln=0; ln < NUM_FREE_LISTS; ln++) { - for (bd = free_list[ln]; bd != NULL; bd = bd->link) { - total_blocks += bd->blocks; + for (node = 0; node < RtsFlags.GcFlags.nNumaNodes; node++) { + for (ln=0; ln < NUM_FREE_LISTS; ln++) { + for (bd = free_list[node][ln]; bd != NULL; bd = bd->link) { + total_blocks += bd->blocks; + } + } + for (bd = free_mblock_list[node]; bd != NULL; bd = bd->link) { + total_blocks += BLOCKS_PER_MBLOCK * BLOCKS_TO_MBLOCKS(bd->blocks); + // The caller of this function, memInventory(), expects to match + // the total number of blocks in the system against mblocks * + // BLOCKS_PER_MBLOCK, so we must subtract the space for the + // block descriptors from *every* mblock. } - } - for (bd = free_mblock_list; bd != NULL; bd = bd->link) { - total_blocks += BLOCKS_PER_MBLOCK * BLOCKS_TO_MBLOCKS(bd->blocks); - // The caller of this function, memInventory(), expects to match - // the total number of blocks in the system against mblocks * - // BLOCKS_PER_MBLOCK, so we must subtract the space for the - // block descriptors from *every* mblock. } return total_blocks; } diff --git a/rts/sm/BlockAlloc.h b/rts/sm/BlockAlloc.h index 2ba7c02c08..c26ae104e1 100644 --- a/rts/sm/BlockAlloc.h +++ b/rts/sm/BlockAlloc.h @@ -12,6 +12,7 @@ #include "BeginPrivate.h" bdescr *allocLargeChunk (W_ min, W_ max); +bdescr *allocLargeChunkOnNode (uint32_t node, W_ min, W_ max); /* Debugging -------------------------------------------------------------- */ diff --git a/rts/sm/GC.c b/rts/sm/GC.c index 996ce8cbce..3bfdaa25ff 100644 --- a/rts/sm/GC.c +++ b/rts/sm/GC.c @@ -802,7 +802,8 @@ new_gc_thread (uint32_t n, gc_thread *t) // but can't, because it uses gct which isn't set up at this point. // Hence, allocate a block for todo_bd manually: { - bdescr *bd = allocBlock(); // no lock, locks aren't initialised yet + bdescr *bd = allocBlockOnNode(capNoToNumaNode(n)); + // no lock, locks aren't initialised yet initBdescr(bd, ws->gen, ws->gen->to); bd->flags = BF_EVACUATED; bd->u.scan = bd->free = bd->start; @@ -1182,7 +1183,8 @@ prepare_collected_gen (generation *gen) if (g != 0) { for (i = 0; i < n_capabilities; i++) { freeChain(capabilities[i]->mut_lists[g]); - capabilities[i]->mut_lists[g] = allocBlock(); + capabilities[i]->mut_lists[g] = + allocBlockOnNode(capNoToNumaNode(i)); } } @@ -1296,7 +1298,7 @@ static void stash_mut_list (Capability *cap, uint32_t gen_no) { cap->saved_mut_lists[gen_no] = cap->mut_lists[gen_no]; - cap->mut_lists[gen_no] = allocBlock_sync(); + cap->mut_lists[gen_no] = allocBlockOnNode_sync(cap->node); } /* ---------------------------------------------------------------------------- diff --git a/rts/sm/GCUtils.c b/rts/sm/GCUtils.c index 5edf9dedbc..a515665d07 100644 --- a/rts/sm/GCUtils.c +++ b/rts/sm/GCUtils.c @@ -30,34 +30,33 @@ SpinLock gc_alloc_block_sync; #endif -bdescr * -allocBlock_sync(void) +bdescr* allocGroup_sync(uint32_t n) { bdescr *bd; + uint32_t node = capNoToNumaNode(gct->thread_index); ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync); - bd = allocBlock(); + bd = allocGroupOnNode(node,n); RELEASE_SPIN_LOCK(&gc_alloc_block_sync); return bd; } -static bdescr * -allocGroup_sync(uint32_t n) +bdescr* allocGroupOnNode_sync(uint32_t node, uint32_t n) { bdescr *bd; ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync); - bd = allocGroup(n); + bd = allocGroupOnNode(node,n); RELEASE_SPIN_LOCK(&gc_alloc_block_sync); return bd; } - static uint32_t allocBlocks_sync(uint32_t n, bdescr **hd) { bdescr *bd; uint32_t i; + uint32_t node = capNoToNumaNode(gct->thread_index); ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync); - bd = allocLargeChunk(1,n); + bd = allocLargeChunkOnNode(node,1,n); // NB. allocLargeChunk, rather than allocGroup(n), to allocate in a // fragmentation-friendly way. n = bd->blocks; diff --git a/rts/sm/GCUtils.h b/rts/sm/GCUtils.h index 0f87eee3f1..7e5a827ce0 100644 --- a/rts/sm/GCUtils.h +++ b/rts/sm/GCUtils.h @@ -18,7 +18,19 @@ #include "GCTDecl.h" -bdescr *allocBlock_sync(void); +bdescr* allocGroup_sync(uint32_t n); +bdescr* allocGroupOnNode_sync(uint32_t node, uint32_t n); + +INLINE_HEADER bdescr *allocBlock_sync(void) +{ + return allocGroup_sync(1); +} + +INLINE_HEADER bdescr *allocBlockOnNode_sync(uint32_t node) +{ + return allocGroupOnNode_sync(node,1); +} + void freeChain_sync(bdescr *bd); void push_scanned_block (bdescr *bd, gen_workspace *ws); diff --git a/rts/sm/MBlock.c b/rts/sm/MBlock.c index 440b03efa7..53999d2c4b 100644 --- a/rts/sm/MBlock.c +++ b/rts/sm/MBlock.c @@ -566,7 +566,7 @@ void releaseFreeMemory(void) void * getMBlock(void) { - return getMBlocks(1); + return getMBlocks(1); } // The external interface: allocate 'n' mblocks, and return the @@ -587,6 +587,23 @@ getMBlocks(uint32_t n) return ret; } +void * +getMBlocksOnNode(uint32_t node, uint32_t n) +{ + void *addr = getMBlocks(n); +#ifdef DEBUG + if (RtsFlags.DebugFlags.numa) return addr; // faking NUMA +#endif + osBindMBlocksToNode(addr, n * MBLOCK_SIZE, RtsFlags.GcFlags.numaMap[node]); + return addr; +} + +void * +getMBlockOnNode(uint32_t node) +{ + return getMBlocksOnNode(node, 1); +} + void freeMBlocks(void *addr, uint32_t n) { diff --git a/rts/sm/MarkStack.h b/rts/sm/MarkStack.h index f978a32563..d90b5e47b4 100644 --- a/rts/sm/MarkStack.h +++ b/rts/sm/MarkStack.h @@ -15,6 +15,7 @@ #define SM_MARKSTACK_H #include "BeginPrivate.h" +#include "GCUtils.h" INLINE_HEADER void push_mark_stack(StgPtr p) diff --git a/rts/sm/OSMem.h b/rts/sm/OSMem.h index 8518f05d1b..660942827d 100644 --- a/rts/sm/OSMem.h +++ b/rts/sm/OSMem.h @@ -19,6 +19,10 @@ void osFreeAllMBlocks(void); size_t getPageSize (void); StgWord64 getPhysicalMemorySize (void); void setExecutable (void *p, W_ len, rtsBool exec); +rtsBool osNumaAvailable(void); +uint32_t osNumaNodes(void); +StgWord osNumaMask(void); +void osBindMBlocksToNode(void *addr, StgWord size, uint32_t node); INLINE_HEADER size_t roundDownToPage (size_t x) diff --git a/rts/sm/Storage.c b/rts/sm/Storage.c index 717c96ae81..a9a7857d43 100644 --- a/rts/sm/Storage.c +++ b/rts/sm/Storage.c @@ -6,7 +6,7 @@ * * Documentation on the architecture of the Storage Manager can be * found in the online commentary: - * + * * http://ghc.haskell.org/trac/ghc/wiki/Commentary/Rts/Storage * * ---------------------------------------------------------------------------*/ @@ -37,7 +37,7 @@ #include "ffi.h" -/* +/* * All these globals require sm_mutex to access in THREADED_RTS mode. */ StgIndStatic *dyn_caf_list = NULL; @@ -54,9 +54,22 @@ generation *generations = NULL; /* all the generations */ generation *g0 = NULL; /* generation 0, for convenience */ generation *oldest_gen = NULL; /* oldest generation, for convenience */ -nursery *nurseries = NULL; /* array of nurseries, size == n_capabilities */ +/* + * Array of nurseries, size == n_capabilities + * + * nursery[i] belongs to NUMA node (i % RtsFlags.GcFlags.nNumaNodes) + * This is chosen to be the same convention as capabilities[i], so + * that when not using nursery chunks (+RTS -n), we just map + * capabilities to nurseries 1:1. + */ +nursery *nurseries = NULL; uint32_t n_nurseries; -volatile StgWord next_nursery = 0; + +/* + * When we are using nursery chunks, we need a separate next_nursery + * pointer for each NUMA node. + */ +volatile StgWord next_nursery[MAX_NUMA_NODES]; #ifdef THREADED_RTS /* @@ -104,7 +117,7 @@ initGeneration (generation *gen, int g) void initStorage (void) { - uint32_t g; + uint32_t g, n; if (generations != NULL) { // multi-init protection @@ -120,22 +133,22 @@ initStorage (void) ASSERT(LOOKS_LIKE_INFO_PTR_NOT_NULL((StgWord)&stg_BLOCKING_QUEUE_CLEAN_info)); ASSERT(LOOKS_LIKE_CLOSURE_PTR(&stg_dummy_ret_closure)); ASSERT(!HEAP_ALLOCED(&stg_dummy_ret_closure)); - + if (RtsFlags.GcFlags.maxHeapSize != 0 && - RtsFlags.GcFlags.heapSizeSuggestion > + RtsFlags.GcFlags.heapSizeSuggestion > RtsFlags.GcFlags.maxHeapSize) { RtsFlags.GcFlags.maxHeapSize = RtsFlags.GcFlags.heapSizeSuggestion; } if (RtsFlags.GcFlags.maxHeapSize != 0 && - RtsFlags.GcFlags.minAllocAreaSize > + RtsFlags.GcFlags.minAllocAreaSize > RtsFlags.GcFlags.maxHeapSize) { errorBelch("maximum heap size (-M) is smaller than minimum alloc area size (-A)"); RtsFlags.GcFlags.minAllocAreaSize = RtsFlags.GcFlags.maxHeapSize; } initBlockAllocator(); - + #if defined(THREADED_RTS) initMutex(&sm_mutex); #endif @@ -143,7 +156,7 @@ initStorage (void) ACQUIRE_SM_LOCK; /* allocate generation info array */ - generations = (generation *)stgMallocBytes(RtsFlags.GcFlags.generations + generations = (generation *)stgMallocBytes(RtsFlags.GcFlags.generations * sizeof(struct generation_), "initStorage: gens"); @@ -161,7 +174,7 @@ initStorage (void) generations[g].to = &generations[g+1]; } oldest_gen->to = oldest_gen; - + /* The oldest generation has one step. */ if (RtsFlags.GcFlags.compact || RtsFlags.GcFlags.sweep) { if (RtsFlags.GcFlags.generations == 1) { @@ -178,7 +191,7 @@ initStorage (void) dyn_caf_list = (StgIndStatic*)END_OF_CAF_LIST; debug_caf_list = (StgIndStatic*)END_OF_CAF_LIST; revertible_caf_list = (StgIndStatic*)END_OF_CAF_LIST; - + if (RtsFlags.GcFlags.largeAllocLim > 0) { large_alloc_lim = RtsFlags.GcFlags.largeAllocLim * BLOCK_SIZE_W; } else { @@ -196,7 +209,9 @@ initStorage (void) N = 0; - next_nursery = 0; + for (n = 0; n < RtsFlags.GcFlags.nNumaNodes; n++) { + next_nursery[n] = n; + } storageAddCapabilities(0, n_capabilities); IF_DEBUG(gc, statDescribeGens()); @@ -257,7 +272,8 @@ void storageAddCapabilities (uint32_t from, uint32_t to) // allocate a block for each mut list for (n = from; n < to; n++) { for (g = 1; g < RtsFlags.GcFlags.generations; g++) { - capabilities[n]->mut_lists[g] = allocBlock(); + capabilities[n]->mut_lists[g] = + allocBlockOnNode(capNoToNumaNode(n)); } } @@ -526,7 +542,7 @@ StgInd* newGCdCAF (StgRegTable *reg, StgIndStatic *caf) -------------------------------------------------------------------------- */ static bdescr * -allocNursery (bdescr *tail, W_ blocks) +allocNursery (uint32_t node, bdescr *tail, W_ blocks) { bdescr *bd = NULL; W_ i, n; @@ -542,7 +558,7 @@ allocNursery (bdescr *tail, W_ blocks) // allocLargeChunk will prefer large chunks, but will pick up // small chunks if there are any available. We must allow // single blocks here to avoid fragmentation (#7257) - bd = allocLargeChunk(1, n); + bd = allocLargeChunkOnNode(node, 1, n); n = bd->blocks; blocks -= n; @@ -584,6 +600,7 @@ assignNurseryToCapability (Capability *cap, uint32_t n) cap->r.rCurrentNursery = nurseries[n].blocks; newNurseryBlock(nurseries[n].blocks); cap->r.rCurrentAlloc = NULL; + ASSERT(cap->r.rCurrentNursery->node == cap->node); } /* @@ -593,16 +610,18 @@ assignNurseryToCapability (Capability *cap, uint32_t n) static void assignNurseriesToCapabilities (uint32_t from, uint32_t to) { - uint32_t i; + uint32_t i, node; for (i = from; i < to; i++) { - assignNurseryToCapability(capabilities[i], next_nursery++); + node = capabilities[i]->node; + assignNurseryToCapability(capabilities[i], next_nursery[node]); + next_nursery[node] += RtsFlags.GcFlags.nNumaNodes; } } static void allocNurseries (uint32_t from, uint32_t to) -{ +{ uint32_t i; memcount n_blocks; @@ -613,24 +632,28 @@ allocNurseries (uint32_t from, uint32_t to) } for (i = from; i < to; i++) { - nurseries[i].blocks = allocNursery(NULL, n_blocks); + nurseries[i].blocks = allocNursery(capNoToNumaNode(i), NULL, n_blocks); nurseries[i].n_blocks = n_blocks; } } - + void resetNurseries (void) { - next_nursery = 0; + uint32_t n; + + for (n = 0; n < RtsFlags.GcFlags.nNumaNodes; n++) { + next_nursery[n] = n; + } assignNurseriesToCapabilities(0, n_capabilities); #ifdef DEBUG bdescr *bd; - uint32_t n; for (n = 0; n < n_nurseries; n++) { for (bd = nurseries[n].blocks; bd; bd = bd->link) { ASSERT(bd->gen_no == 0); ASSERT(bd->gen == g0); + ASSERT(bd->node == capNoToNumaNode(n)); IF_DEBUG(sanity, memset(bd->start, 0xaa, BLOCK_SIZE)); } } @@ -649,56 +672,54 @@ countNurseryBlocks (void) return blocks; } -static void -resizeNursery (nursery *nursery, W_ blocks) -{ - bdescr *bd; - W_ nursery_blocks; - - nursery_blocks = nursery->n_blocks; - if (nursery_blocks == blocks) return; - - if (nursery_blocks < blocks) { - debugTrace(DEBUG_gc, "increasing size of nursery to %d blocks", - blocks); - nursery->blocks = allocNursery(nursery->blocks, blocks-nursery_blocks); - } - else { - bdescr *next_bd; - - debugTrace(DEBUG_gc, "decreasing size of nursery to %d blocks", - blocks); - - bd = nursery->blocks; - while (nursery_blocks > blocks) { - next_bd = bd->link; - next_bd->u.back = NULL; - nursery_blocks -= bd->blocks; // might be a large block - freeGroup(bd); - bd = next_bd; - } - nursery->blocks = bd; - // might have gone just under, by freeing a large block, so make - // up the difference. - if (nursery_blocks < blocks) { - nursery->blocks = allocNursery(nursery->blocks, blocks-nursery_blocks); - } - } - - nursery->n_blocks = blocks; - ASSERT(countBlocks(nursery->blocks) == nursery->n_blocks); -} - -// +// // Resize each of the nurseries to the specified size. // static void resizeNurseriesEach (W_ blocks) { - uint32_t i; + uint32_t i, node; + bdescr *bd; + W_ nursery_blocks; + nursery *nursery; for (i = 0; i < n_nurseries; i++) { - resizeNursery(&nurseries[i], blocks); + nursery = &nurseries[i]; + nursery_blocks = nursery->n_blocks; + if (nursery_blocks == blocks) continue; + + node = capNoToNumaNode(i); + if (nursery_blocks < blocks) { + debugTrace(DEBUG_gc, "increasing size of nursery to %d blocks", + blocks); + nursery->blocks = allocNursery(node, nursery->blocks, + blocks-nursery_blocks); + } + else + { + bdescr *next_bd; + + debugTrace(DEBUG_gc, "decreasing size of nursery to %d blocks", + blocks); + + bd = nursery->blocks; + while (nursery_blocks > blocks) { + next_bd = bd->link; + next_bd->u.back = NULL; + nursery_blocks -= bd->blocks; // might be a large block + freeGroup(bd); + bd = next_bd; + } + nursery->blocks = bd; + // might have gone just under, by freeing a large block, so make + // up the difference. + if (nursery_blocks < blocks) { + nursery->blocks = allocNursery(node, nursery->blocks, + blocks-nursery_blocks); + } + } + nursery->n_blocks = blocks; + ASSERT(countBlocks(nursery->blocks) == nursery->n_blocks); } } @@ -716,7 +737,7 @@ resizeNurseriesFixed (void) resizeNurseriesEach(blocks); } -// +// // Resize the nurseries to the total specified size. // void @@ -731,18 +752,42 @@ rtsBool getNewNursery (Capability *cap) { StgWord i; + uint32_t node = cap->node; + uint32_t n; for(;;) { - i = next_nursery; - if (i >= n_nurseries) { + i = next_nursery[node]; + if (i < n_nurseries) { + if (cas(&next_nursery[node], i, + i+RtsFlags.GcFlags.nNumaNodes) == i) { + assignNurseryToCapability(cap, i); + return rtsTrue; + } + } else if (RtsFlags.GcFlags.nNumaNodes > 1) { + // Try to find an unused nursery chunk on other nodes. We'll get + // remote memory, but the rationale is that avoiding GC is better + // than avoiding remote memory access. + rtsBool lost = rtsFalse; + for (n = 0; n < RtsFlags.GcFlags.nNumaNodes; n++) { + if (n == node) continue; + i = next_nursery[n]; + if (i < n_nurseries) { + if (cas(&next_nursery[n], i, + i+RtsFlags.GcFlags.nNumaNodes) == i) { + assignNurseryToCapability(cap, i); + return rtsTrue; + } else { + lost = rtsTrue; /* lost a race */ + } + } + } + if (!lost) return rtsFalse; + } else { return rtsFalse; } - if (cas(&next_nursery, i, i+1) == i) { - assignNurseryToCapability(cap, i); - return rtsTrue; - } } } + /* ----------------------------------------------------------------------------- move_STACK is called to update the TSO structure after it has been moved from one place to another. @@ -753,8 +798,8 @@ move_STACK (StgStack *src, StgStack *dest) { ptrdiff_t diff; - // relocate the stack pointer... - diff = (StgPtr)dest - (StgPtr)src; // In *words* + // relocate the stack pointer... + diff = (StgPtr)dest - (StgPtr)src; // In *words* dest->sp = (StgPtr)dest->sp + diff; } @@ -818,7 +863,7 @@ allocate (Capability *cap, W_ n) } ACQUIRE_SM_LOCK - bd = allocGroup(req_blocks); + bd = allocGroupOnNode(cap->node,req_blocks); dbl_link_onto(bd, &g0->large_objects); g0->n_large_blocks += bd->blocks; // might be larger than req_blocks g0->n_new_large_words += n; @@ -834,19 +879,19 @@ allocate (Capability *cap, W_ n) bd = cap->r.rCurrentAlloc; if (bd == NULL || bd->free + n > bd->start + BLOCK_SIZE_W) { - + if (bd) finishedNurseryBlock(cap,bd); // The CurrentAlloc block is full, we need to find another // one. First, we try taking the next block from the // nursery: bd = cap->r.rCurrentNursery->link; - + if (bd == NULL) { // The nursery is empty: allocate a fresh block (we can't // fail here). ACQUIRE_SM_LOCK; - bd = allocBlock(); + bd = allocBlockOnNode(cap->node); cap->r.rNursery->n_blocks++; RELEASE_SM_LOCK; initBdescr(bd, g0, g0); @@ -944,7 +989,7 @@ allocatePinned (Capability *cap, W_ n) } bd = cap->pinned_object_block; - + // If we don't have a block of pinned objects yet, or the current // one isn't large enough to hold the new object, get a new one. if (bd == NULL || (bd->free + n) > (bd->start + BLOCK_SIZE_W)) { @@ -964,24 +1009,13 @@ allocatePinned (Capability *cap, W_ n) // objects scale really badly if we do this). // // So first, we try taking the next block from the nursery, in - // the same way as allocate(), but note that we can only take - // an *empty* block, because we're about to mark it as - // BF_PINNED | BF_LARGE. + // the same way as allocate(). bd = cap->r.rCurrentNursery->link; - if (bd == NULL) { // must be empty! - // The nursery is empty, or the next block is non-empty: - // allocate a fresh block (we can't fail here). - - // XXX in the case when the next nursery block is - // non-empty we aren't exerting any pressure to GC soon, - // so if this case ever happens then we could in theory - // keep allocating for ever without calling the GC. We - // can't bump g0->n_new_large_words because that will be - // counted towards allocation, and we're already counting - // our pinned obects as allocation in - // collect_pinned_object_blocks in the GC. + if (bd == NULL) { + // The nursery is empty: allocate a fresh block (we can't fail + // here). ACQUIRE_SM_LOCK; - bd = allocBlock(); + bd = allocBlockOnNode(cap->node); RELEASE_SM_LOCK; initBdescr(bd, g0, g0); } else { @@ -1217,13 +1251,13 @@ W_ gcThreadLiveBlocks (uint32_t i, uint32_t g) * that will be collected next time will therefore need twice as many * blocks since all the data will be copied. */ -extern W_ +extern W_ calcNeeded (rtsBool force_major, memcount *blocks_needed) { W_ needed = 0, blocks; uint32_t g, N; generation *gen; - + if (force_major) { N = RtsFlags.GcFlags.generations - 1; } else { @@ -1238,7 +1272,7 @@ calcNeeded (rtsBool force_major, memcount *blocks_needed) // we need at least this much space needed += blocks; - + // are we collecting this gen? if (g == 0 || // always collect gen 0 blocks > gen->max_blocks) |