diff options
author | Simon Marlow <marlowsd@gmail.com> | 2016-04-23 21:14:49 +0100 |
---|---|---|
committer | Simon Marlow <marlowsd@gmail.com> | 2016-06-10 21:25:54 +0100 |
commit | 9e5ea67e268be2659cd30ebaed7044d298198ab0 (patch) | |
tree | c395e74ee772ae0d59c852b3cbde743784b08d09 /rts/sm/BlockAlloc.c | |
parent | b9fa72a24ba2cc3120912e6afedc9280d28d2077 (diff) | |
download | haskell-9e5ea67e268be2659cd30ebaed7044d298198ab0.tar.gz |
NUMA support
Summary:
The aim here is to reduce the number of remote memory accesses on
systems with a NUMA memory architecture, typically multi-socket servers.
Linux provides a NUMA API for doing two things:
* Allocating memory local to a particular node
* Binding a thread to a particular node
When given the +RTS --numa flag, the runtime will
* Determine the number of NUMA nodes (N) by querying the OS
* Assign capabilities to nodes, so cap C is on node C%N
* Bind worker threads on a capability to the correct node
* Keep a separate free list in the block layer for each node
* Allocate the nursery for a capability from node-local memory
* Allocate blocks in the GC from node-local memory
For example, using nofib/parallel/queens on a 24-core 2-socket machine:
```
$ ./Main 15 +RTS -N24 -s -A64m
Total time 173.960s ( 7.467s elapsed)
$ ./Main 15 +RTS -N24 -s -A64m --numa
Total time 150.836s ( 6.423s elapsed)
```
The biggest win here is expected to be allocating from node-local
memory, so that means programs using a large -A value (as here).
According to perf, on this program the number of remote memory accesses
was reduced by more than 50% by using `--numa`.
Test Plan:
* validate
* There's a new flag --debug-numa=<n> that pretends to do NUMA without
actually making the OS calls, which is useful for testing the code
on non-NUMA systems.
* TODO: I need to add some unit tests
Reviewers: erikd, austin, rwbarton, ezyang, bgamari, hvr, niteria
Subscribers: thomie
Differential Revision: https://phabricator.haskell.org/D2199
Diffstat (limited to 'rts/sm/BlockAlloc.c')
-rw-r--r-- | rts/sm/BlockAlloc.c | 350 |
1 files changed, 216 insertions, 134 deletions
diff --git a/rts/sm/BlockAlloc.c b/rts/sm/BlockAlloc.c index ff1a6460a4..c2859b0c15 100644 --- a/rts/sm/BlockAlloc.c +++ b/rts/sm/BlockAlloc.c @@ -7,11 +7,11 @@ * This is the architecture independent part of the block allocator. * It requires only the following support from the operating system: * - * void *getMBlock(uint32_t n); + * void *getMBlocks(uint32_t n); * * returns the address of an n*MBLOCK_SIZE region of memory, aligned on * an MBLOCK_SIZE boundary. There are no other restrictions on the - * addresses of memory returned by getMBlock(). + * addresses of memory returned by getMBlocks(). * * ---------------------------------------------------------------------------*/ @@ -25,7 +25,7 @@ #include <string.h> -static void initMBlock(void *mblock); +static void initMBlock(void *mblock, uint32_t node); /* ----------------------------------------------------------------------------- @@ -158,28 +158,55 @@ static void initMBlock(void *mblock); // In THREADED_RTS mode, the free list is protected by sm_mutex. 
-static bdescr *free_list[NUM_FREE_LISTS]; -static bdescr *free_mblock_list; +static bdescr *free_list[MAX_NUMA_NODES][NUM_FREE_LISTS]; +static bdescr *free_mblock_list[MAX_NUMA_NODES]; W_ n_alloc_blocks; // currently allocated blocks W_ hw_alloc_blocks; // high-water allocated blocks +W_ n_alloc_blocks_by_node[MAX_NUMA_NODES]; + /* ----------------------------------------------------------------------------- Initialisation -------------------------------------------------------------------------- */ void initBlockAllocator(void) { - uint32_t i; - for (i=0; i < NUM_FREE_LISTS; i++) { - free_list[i] = NULL; + uint32_t i, node; + for (node = 0; node < MAX_NUMA_NODES; node++) { + for (i=0; i < NUM_FREE_LISTS; i++) { + free_list[node][i] = NULL; + } + free_mblock_list[node] = NULL; + n_alloc_blocks_by_node[node] = 0; } - free_mblock_list = NULL; n_alloc_blocks = 0; hw_alloc_blocks = 0; } /* ----------------------------------------------------------------------------- + Accounting + -------------------------------------------------------------------------- */ + +STATIC_INLINE +void recordAllocatedBlocks(uint32_t node, uint32_t n) +{ + n_alloc_blocks += n; + n_alloc_blocks_by_node[node] += n; + if (n > 0 && n_alloc_blocks > hw_alloc_blocks) { + hw_alloc_blocks = n_alloc_blocks; + } +} + +STATIC_INLINE +void recordFreedBlocks(uint32_t node, uint32_t n) +{ + ASSERT(n_alloc_blocks >= n); + n_alloc_blocks -= n; + n_alloc_blocks_by_node[node] -= n; +} + +/* ----------------------------------------------------------------------------- Allocation -------------------------------------------------------------------------- */ @@ -248,14 +275,14 @@ log_2_ceil(W_ n) } STATIC_INLINE void -free_list_insert (bdescr *bd) +free_list_insert (uint32_t node, bdescr *bd) { uint32_t ln; ASSERT(bd->blocks < BLOCKS_PER_MBLOCK); ln = log_2(bd->blocks); - dbl_link_onto(bd, &free_list[ln]); + dbl_link_onto(bd, &free_list[node][ln]); } @@ -284,18 +311,18 @@ setup_tail (bdescr *bd) // Take a free 
block group bd, and split off a group of size n from // it. Adjust the free list as necessary, and return the new group. static bdescr * -split_free_block (bdescr *bd, W_ n, uint32_t ln) +split_free_block (bdescr *bd, uint32_t node, W_ n, uint32_t ln) { bdescr *fg; // free group ASSERT(bd->blocks > n); - dbl_link_remove(bd, &free_list[ln]); + dbl_link_remove(bd, &free_list[node][ln]); fg = bd + bd->blocks - n; // take n blocks off the end fg->blocks = n; bd->blocks -= n; setup_tail(bd); ln = log_2(bd->blocks); - dbl_link_onto(bd, &free_list[ln]); + dbl_link_onto(bd, &free_list[node][ln]); return fg; } @@ -304,7 +331,7 @@ split_free_block (bdescr *bd, W_ n, uint32_t ln) * initGroup afterwards. */ static bdescr * -alloc_mega_group (StgWord mblocks) +alloc_mega_group (uint32_t node, StgWord mblocks) { bdescr *best, *bd, *prev; StgWord n; @@ -313,14 +340,14 @@ alloc_mega_group (StgWord mblocks) best = NULL; prev = NULL; - for (bd = free_mblock_list; bd != NULL; prev = bd, bd = bd->link) + for (bd = free_mblock_list[node]; bd != NULL; prev = bd, bd = bd->link) { if (bd->blocks == n) { if (prev) { prev->link = bd->link; } else { - free_mblock_list = bd->link; + free_mblock_list[node] = bd->link; } return bd; } @@ -341,12 +368,17 @@ alloc_mega_group (StgWord mblocks) (best_mblocks-mblocks)*MBLOCK_SIZE); best->blocks = MBLOCK_GROUP_BLOCKS(best_mblocks - mblocks); - initMBlock(MBLOCK_ROUND_DOWN(bd)); + initMBlock(MBLOCK_ROUND_DOWN(bd), node); } else { - void *mblock = getMBlocks(mblocks); - initMBlock(mblock); // only need to init the 1st one + void *mblock; + if (RtsFlags.GcFlags.numa) { + mblock = getMBlocksOnNode(node, mblocks); + } else { + mblock = getMBlocks(mblocks); + } + initMBlock(mblock, node); // only need to init the 1st one bd = FIRST_BDESCR(mblock); } bd->blocks = MBLOCK_GROUP_BLOCKS(mblocks); @@ -354,7 +386,7 @@ alloc_mega_group (StgWord mblocks) } bdescr * -allocGroup (W_ n) +allocGroupOnNode (uint32_t node, W_ n) { bdescr *bd, *rem; StgWord ln; @@ -369,21 
+401,19 @@ allocGroup (W_ n) // n_alloc_blocks doesn't count the extra blocks we get in a // megablock group. - n_alloc_blocks += mblocks * BLOCKS_PER_MBLOCK; - if (n_alloc_blocks > hw_alloc_blocks) hw_alloc_blocks = n_alloc_blocks; + recordAllocatedBlocks(node, mblocks * BLOCKS_PER_MBLOCK); - bd = alloc_mega_group(mblocks); + bd = alloc_mega_group(node, mblocks); // only the bdescrs of the first MB are required to be initialised initGroup(bd); goto finish; } - n_alloc_blocks += n; - if (n_alloc_blocks > hw_alloc_blocks) hw_alloc_blocks = n_alloc_blocks; + recordAllocatedBlocks(node, n); ln = log_2_ceil(n); - while (ln < NUM_FREE_LISTS && free_list[ln] == NULL) { + while (ln < NUM_FREE_LISTS && free_list[node][ln] == NULL) { ln++; } @@ -397,27 +427,27 @@ allocGroup (W_ n) } #endif - bd = alloc_mega_group(1); + bd = alloc_mega_group(node,1); bd->blocks = n; initGroup(bd); // we know the group will fit rem = bd + n; rem->blocks = BLOCKS_PER_MBLOCK-n; - initGroup(rem); // init the slop - n_alloc_blocks += rem->blocks; + initGroup(rem); // init the slop + recordAllocatedBlocks(node,rem->blocks); freeGroup(rem); // add the slop on to the free list goto finish; } - bd = free_list[ln]; + bd = free_list[node][ln]; if (bd->blocks == n) // exactly the right size! { - dbl_link_remove(bd, &free_list[ln]); + dbl_link_remove(bd, &free_list[node][ln]); initGroup(bd); } else if (bd->blocks > n) // block too big... 
{ - bd = split_free_block(bd, n, ln); + bd = split_free_block(bd, node, n, ln); ASSERT(bd->blocks == n); initGroup(bd); } @@ -432,6 +462,26 @@ finish: return bd; } +STATIC_INLINE +uint32_t nodeWithLeastBlocks (void) +{ + uint32_t node = 0, i; + uint32_t min_blocks = n_alloc_blocks_by_node[0]; + for (i = 1; i < RtsFlags.GcFlags.nNumaNodes; i++) { + if (n_alloc_blocks_by_node[i] < min_blocks) { + min_blocks = n_alloc_blocks_by_node[i]; + node = i; + } + } + return node; +} + +bdescr* allocGroup (W_ n) +{ + return allocGroupOnNode(nodeWithLeastBlocks(),n); +} + + // // Allocate a chunk of blocks that is at least min and at most max // blocks in size. This API is used by the nursery allocator that @@ -448,8 +498,7 @@ finish: // fragmentation, but we make sure that we allocate large blocks // preferably if there are any. // -bdescr * -allocLargeChunk (W_ min, W_ max) +bdescr* allocLargeChunkOnNode (uint32_t node, W_ min, W_ max) { bdescr *bd; StgWord ln, lnmax; @@ -461,34 +510,38 @@ allocLargeChunk (W_ min, W_ max) ln = log_2_ceil(min); lnmax = log_2_ceil(max); - while (ln < NUM_FREE_LISTS && ln < lnmax && free_list[ln] == NULL) { + while (ln < NUM_FREE_LISTS && ln < lnmax && free_list[node][ln] == NULL) { ln++; } if (ln == NUM_FREE_LISTS || ln == lnmax) { - return allocGroup(max); + return allocGroupOnNode(node,max); } - bd = free_list[ln]; + bd = free_list[node][ln]; if (bd->blocks <= max) // exactly the right size! { - dbl_link_remove(bd, &free_list[ln]); + dbl_link_remove(bd, &free_list[node][ln]); initGroup(bd); } else // block too big... 
{ - bd = split_free_block(bd, max, ln); + bd = split_free_block(bd, node, max, ln); ASSERT(bd->blocks == max); initGroup(bd); } - n_alloc_blocks += bd->blocks; - if (n_alloc_blocks > hw_alloc_blocks) hw_alloc_blocks = n_alloc_blocks; + recordAllocatedBlocks(node, bd->blocks); IF_DEBUG(sanity, memset(bd->start, 0xaa, bd->blocks * BLOCK_SIZE)); IF_DEBUG(sanity, checkFreeListSanity()); return bd; } +bdescr* allocLargeChunk (W_ min, W_ max) +{ + return allocLargeChunkOnNode(nodeWithLeastBlocks(), min, max); +} + bdescr * allocGroup_lock(W_ n) { @@ -500,17 +553,31 @@ allocGroup_lock(W_ n) } bdescr * -allocBlock(void) +allocBlock_lock(void) { - return allocGroup(1); + bdescr *bd; + ACQUIRE_SM_LOCK; + bd = allocBlock(); + RELEASE_SM_LOCK; + return bd; } bdescr * -allocBlock_lock(void) +allocGroupOnNode_lock(uint32_t node, W_ n) { bdescr *bd; ACQUIRE_SM_LOCK; - bd = allocBlock(); + bd = allocGroupOnNode(node,n); + RELEASE_SM_LOCK; + return bd; +} + +bdescr * +allocBlockOnNode_lock(uint32_t node) +{ + bdescr *bd; + ACQUIRE_SM_LOCK; + bd = allocBlockOnNode(node); RELEASE_SM_LOCK; return bd; } @@ -542,11 +609,13 @@ static void free_mega_group (bdescr *mg) { bdescr *bd, *prev; + uint32_t node; // Find the right place in the free list. free_mblock_list is // sorted by *address*, not by size as the free_list is. 
prev = NULL; - bd = free_mblock_list; + node = mg->node; + bd = free_mblock_list[node]; while (bd && bd->start < mg->start) { prev = bd; bd = bd->link; @@ -561,8 +630,8 @@ free_mega_group (bdescr *mg) } else { - mg->link = free_mblock_list; - free_mblock_list = mg; + mg->link = free_mblock_list[node]; + free_mblock_list[node] = mg; } // coalesce forwards coalesce_mblocks(mg); @@ -575,12 +644,15 @@ void freeGroup(bdescr *p) { StgWord ln; + uint32_t node; - // Todo: not true in multithreaded GC + // not true in multithreaded GC: // ASSERT_SM_LOCK(); ASSERT(p->free != (P_)-1); + node = p->node; + p->free = (void *)-1; /* indicates that this block is free */ p->gen = NULL; p->gen_no = 0; @@ -597,14 +669,13 @@ freeGroup(bdescr *p) // If this is an mgroup, make sure it has the right number of blocks ASSERT(p->blocks == MBLOCK_GROUP_BLOCKS(mblocks)); - n_alloc_blocks -= mblocks * BLOCKS_PER_MBLOCK; + recordFreedBlocks(node, mblocks * BLOCKS_PER_MBLOCK); free_mega_group(p); return; } - ASSERT(n_alloc_blocks >= p->blocks); - n_alloc_blocks -= p->blocks; + recordFreedBlocks(node, p->blocks); // coalesce forwards { @@ -614,7 +685,7 @@ freeGroup(bdescr *p) { p->blocks += next->blocks; ln = log_2(next->blocks); - dbl_link_remove(next, &free_list[ln]); + dbl_link_remove(next, &free_list[node][ln]); if (p->blocks == BLOCKS_PER_MBLOCK) { free_mega_group(p); @@ -634,7 +705,7 @@ freeGroup(bdescr *p) if (prev->free == (P_)-1) { ln = log_2(prev->blocks); - dbl_link_remove(prev, &free_list[ln]); + dbl_link_remove(prev, &free_list[node][ln]); prev->blocks += p->blocks; if (prev->blocks >= BLOCKS_PER_MBLOCK) { @@ -646,7 +717,7 @@ freeGroup(bdescr *p) } setup_tail(p); - free_list_insert(p); + free_list_insert(node,p); IF_DEBUG(sanity, checkFreeListSanity()); } @@ -679,7 +750,7 @@ freeChain_lock(bdescr *bd) } static void -initMBlock(void *mblock) +initMBlock(void *mblock, uint32_t node) { bdescr *bd; StgWord8 *block; @@ -695,6 +766,7 @@ initMBlock(void *mblock) for (; block <= 
(StgWord8*)LAST_BLOCK(mblock); bd += 1, block += BLOCK_SIZE) { bd->start = (void*)block; + bd->node = node; } } @@ -734,28 +806,32 @@ countAllocdBlocks(bdescr *bd) void returnMemoryToOS(uint32_t n /* megablocks */) { - static bdescr *bd; + bdescr *bd; + uint32_t node; StgWord size; - bd = free_mblock_list; - while ((n > 0) && (bd != NULL)) { - size = BLOCKS_TO_MBLOCKS(bd->blocks); - if (size > n) { - StgWord newSize = size - n; - char *freeAddr = MBLOCK_ROUND_DOWN(bd->start); - freeAddr += newSize * MBLOCK_SIZE; - bd->blocks = MBLOCK_GROUP_BLOCKS(newSize); - freeMBlocks(freeAddr, n); - n = 0; - } - else { - char *freeAddr = MBLOCK_ROUND_DOWN(bd->start); - n -= size; - bd = bd->link; - freeMBlocks(freeAddr, size); + // ToDo: not fair, we free all the memory starting with node 0. + for (node = 0; n > 0 && node < RtsFlags.GcFlags.nNumaNodes; node++) { + bd = free_mblock_list[node]; + while ((n > 0) && (bd != NULL)) { + size = BLOCKS_TO_MBLOCKS(bd->blocks); + if (size > n) { + StgWord newSize = size - n; + char *freeAddr = MBLOCK_ROUND_DOWN(bd->start); + freeAddr += newSize * MBLOCK_SIZE; + bd->blocks = MBLOCK_GROUP_BLOCKS(newSize); + freeMBlocks(freeAddr, n); + n = 0; + } + else { + char *freeAddr = MBLOCK_ROUND_DOWN(bd->start); + n -= size; + bd = bd->link; + freeMBlocks(freeAddr, size); + } } + free_mblock_list[node] = bd; } - free_mblock_list = bd; // Ask the OS to release any address space portion // that was associated with the just released MBlocks @@ -797,68 +873,71 @@ checkFreeListSanity(void) { bdescr *bd, *prev; StgWord ln, min; + uint32_t node; - - min = 1; - for (ln = 0; ln < NUM_FREE_LISTS; ln++) { - IF_DEBUG(block_alloc, - debugBelch("free block list [%" FMT_Word "]:\n", ln)); - - prev = NULL; - for (bd = free_list[ln]; bd != NULL; prev = bd, bd = bd->link) - { + for (node = 0; node < RtsFlags.GcFlags.nNumaNodes; node++) { + min = 1; + for (ln = 0; ln < NUM_FREE_LISTS; ln++) { IF_DEBUG(block_alloc, - debugBelch("group at %p, length %ld blocks\n", - 
bd->start, (long)bd->blocks)); - ASSERT(bd->free == (P_)-1); - ASSERT(bd->blocks > 0 && bd->blocks < BLOCKS_PER_MBLOCK); - ASSERT(bd->blocks >= min && bd->blocks <= (min*2 - 1)); - ASSERT(bd->link != bd); // catch easy loops - - check_tail(bd); - - if (prev) - ASSERT(bd->u.back == prev); - else - ASSERT(bd->u.back == NULL); + debugBelch("free block list [%" FMT_Word "]:\n", ln)); + prev = NULL; + for (bd = free_list[node][ln]; bd != NULL; prev = bd, bd = bd->link) { - bdescr *next; - next = bd + bd->blocks; - if (next <= LAST_BDESCR(MBLOCK_ROUND_DOWN(bd))) + IF_DEBUG(block_alloc, + debugBelch("group at %p, length %ld blocks\n", + bd->start, (long)bd->blocks)); + ASSERT(bd->free == (P_)-1); + ASSERT(bd->blocks > 0 && bd->blocks < BLOCKS_PER_MBLOCK); + ASSERT(bd->blocks >= min && bd->blocks <= (min*2 - 1)); + ASSERT(bd->link != bd); // catch easy loops + ASSERT(bd->node == node); + + check_tail(bd); + + if (prev) + ASSERT(bd->u.back == prev); + else + ASSERT(bd->u.back == NULL); + { - ASSERT(next->free != (P_)-1); + bdescr *next; + next = bd + bd->blocks; + if (next <= LAST_BDESCR(MBLOCK_ROUND_DOWN(bd))) + { + ASSERT(next->free != (P_)-1); + } } } + min = min << 1; } - min = min << 1; - } - prev = NULL; - for (bd = free_mblock_list; bd != NULL; prev = bd, bd = bd->link) - { - IF_DEBUG(block_alloc, - debugBelch("mega group at %p, length %ld blocks\n", - bd->start, (long)bd->blocks)); + prev = NULL; + for (bd = free_mblock_list[node]; bd != NULL; prev = bd, bd = bd->link) + { + IF_DEBUG(block_alloc, + debugBelch("mega group at %p, length %ld blocks\n", + bd->start, (long)bd->blocks)); - ASSERT(bd->link != bd); // catch easy loops + ASSERT(bd->link != bd); // catch easy loops - if (bd->link != NULL) - { - // make sure the list is sorted - ASSERT(bd->start < bd->link->start); - } + if (bd->link != NULL) + { + // make sure the list is sorted + ASSERT(bd->start < bd->link->start); + } - ASSERT(bd->blocks >= BLOCKS_PER_MBLOCK); - 
ASSERT(MBLOCK_GROUP_BLOCKS(BLOCKS_TO_MBLOCKS(bd->blocks)) - == bd->blocks); + ASSERT(bd->blocks >= BLOCKS_PER_MBLOCK); + ASSERT(MBLOCK_GROUP_BLOCKS(BLOCKS_TO_MBLOCKS(bd->blocks)) + == bd->blocks); - // make sure we're fully coalesced - if (bd->link != NULL) - { - ASSERT(MBLOCK_ROUND_DOWN(bd->link) != - (StgWord8*)MBLOCK_ROUND_DOWN(bd) + - BLOCKS_TO_MBLOCKS(bd->blocks) * MBLOCK_SIZE); + // make sure we're fully coalesced + if (bd->link != NULL) + { + ASSERT(MBLOCK_ROUND_DOWN(bd->link) != + (StgWord8*)MBLOCK_ROUND_DOWN(bd) + + BLOCKS_TO_MBLOCKS(bd->blocks) * MBLOCK_SIZE); + } } } } @@ -869,18 +948,21 @@ countFreeList(void) bdescr *bd; W_ total_blocks = 0; StgWord ln; + uint32_t node; - for (ln=0; ln < NUM_FREE_LISTS; ln++) { - for (bd = free_list[ln]; bd != NULL; bd = bd->link) { - total_blocks += bd->blocks; + for (node = 0; node < RtsFlags.GcFlags.nNumaNodes; node++) { + for (ln=0; ln < NUM_FREE_LISTS; ln++) { + for (bd = free_list[node][ln]; bd != NULL; bd = bd->link) { + total_blocks += bd->blocks; + } + } + for (bd = free_mblock_list[node]; bd != NULL; bd = bd->link) { + total_blocks += BLOCKS_PER_MBLOCK * BLOCKS_TO_MBLOCKS(bd->blocks); + // The caller of this function, memInventory(), expects to match + // the total number of blocks in the system against mblocks * + // BLOCKS_PER_MBLOCK, so we must subtract the space for the + // block descriptors from *every* mblock. } - } - for (bd = free_mblock_list; bd != NULL; bd = bd->link) { - total_blocks += BLOCKS_PER_MBLOCK * BLOCKS_TO_MBLOCKS(bd->blocks); - // The caller of this function, memInventory(), expects to match - // the total number of blocks in the system against mblocks * - // BLOCKS_PER_MBLOCK, so we must subtract the space for the - // block descriptors from *every* mblock. } return total_blocks; } |