summaryrefslogtreecommitdiff
path: root/rts/sm
diff options
context:
space:
mode:
authorSimon Marlow <marlowsd@gmail.com>2016-04-23 21:14:49 +0100
committerSimon Marlow <marlowsd@gmail.com>2016-06-10 21:25:54 +0100
commit9e5ea67e268be2659cd30ebaed7044d298198ab0 (patch)
treec395e74ee772ae0d59c852b3cbde743784b08d09 /rts/sm
parentb9fa72a24ba2cc3120912e6afedc9280d28d2077 (diff)
downloadhaskell-9e5ea67e268be2659cd30ebaed7044d298198ab0.tar.gz
NUMA support
Summary: The aim here is to reduce the number of remote memory accesses on systems with a NUMA memory architecture, typically multi-socket servers. Linux provides a NUMA API for doing two things: * Allocating memory local to a particular node * Binding a thread to a particular node When given the +RTS --numa flag, the runtime will * Determine the number of NUMA nodes (N) by querying the OS * Assign capabilities to nodes, so cap C is on node C%N * Bind worker threads on a capability to the correct node * Keep a separate free lists in the block layer for each node * Allocate the nursery for a capability from node-local memory * Allocate blocks in the GC from node-local memory For example, using nofib/parallel/queens on a 24-core 2-socket machine: ``` $ ./Main 15 +RTS -N24 -s -A64m Total time 173.960s ( 7.467s elapsed) $ ./Main 15 +RTS -N24 -s -A64m --numa Total time 150.836s ( 6.423s elapsed) ``` The biggest win here is expected to be allocating from node-local memory, so that means programs using a large -A value (as here). According to perf, on this program the number of remote memory accesses were reduced by more than 50% by using `--numa`. Test Plan: * validate * There's a new flag --debug-numa=<n> that pretends to do NUMA without actually making the OS calls, which is useful for testing the code on non-NUMA systems. * TODO: I need to add some unit tests Reviewers: erikd, austin, rwbarton, ezyang, bgamari, hvr, niteria Subscribers: thomie Differential Revision: https://phabricator.haskell.org/D2199
Diffstat (limited to 'rts/sm')
-rw-r--r--rts/sm/BlockAlloc.c350
-rw-r--r--rts/sm/BlockAlloc.h1
-rw-r--r--rts/sm/GC.c8
-rw-r--r--rts/sm/GCUtils.c15
-rw-r--r--rts/sm/GCUtils.h14
-rw-r--r--rts/sm/MBlock.c19
-rw-r--r--rts/sm/MarkStack.h1
-rw-r--r--rts/sm/OSMem.h4
-rw-r--r--rts/sm/Storage.c232
9 files changed, 398 insertions, 246 deletions
diff --git a/rts/sm/BlockAlloc.c b/rts/sm/BlockAlloc.c
index ff1a6460a4..c2859b0c15 100644
--- a/rts/sm/BlockAlloc.c
+++ b/rts/sm/BlockAlloc.c
@@ -7,11 +7,11 @@
* This is the architecture independent part of the block allocator.
* It requires only the following support from the operating system:
*
- * void *getMBlock(uint32_t n);
+ * void *getMBlocks(uint32_t n);
*
* returns the address of an n*MBLOCK_SIZE region of memory, aligned on
* an MBLOCK_SIZE boundary. There are no other restrictions on the
- * addresses of memory returned by getMBlock().
+ * addresses of memory returned by getMBlocks().
*
* ---------------------------------------------------------------------------*/
@@ -25,7 +25,7 @@
#include <string.h>
-static void initMBlock(void *mblock);
+static void initMBlock(void *mblock, uint32_t node);
/* -----------------------------------------------------------------------------
@@ -158,28 +158,55 @@ static void initMBlock(void *mblock);
// In THREADED_RTS mode, the free list is protected by sm_mutex.
-static bdescr *free_list[NUM_FREE_LISTS];
-static bdescr *free_mblock_list;
+static bdescr *free_list[MAX_NUMA_NODES][NUM_FREE_LISTS];
+static bdescr *free_mblock_list[MAX_NUMA_NODES];
W_ n_alloc_blocks; // currently allocated blocks
W_ hw_alloc_blocks; // high-water allocated blocks
+W_ n_alloc_blocks_by_node[MAX_NUMA_NODES];
+
/* -----------------------------------------------------------------------------
Initialisation
-------------------------------------------------------------------------- */
void initBlockAllocator(void)
{
- uint32_t i;
- for (i=0; i < NUM_FREE_LISTS; i++) {
- free_list[i] = NULL;
+ uint32_t i, node;
+ for (node = 0; node < MAX_NUMA_NODES; node++) {
+ for (i=0; i < NUM_FREE_LISTS; i++) {
+ free_list[node][i] = NULL;
+ }
+ free_mblock_list[node] = NULL;
+ n_alloc_blocks_by_node[node] = 0;
}
- free_mblock_list = NULL;
n_alloc_blocks = 0;
hw_alloc_blocks = 0;
}
/* -----------------------------------------------------------------------------
+ Accounting
+ -------------------------------------------------------------------------- */
+
+STATIC_INLINE
+void recordAllocatedBlocks(uint32_t node, uint32_t n)
+{
+ n_alloc_blocks += n;
+ n_alloc_blocks_by_node[node] += n;
+ if (n > 0 && n_alloc_blocks > hw_alloc_blocks) {
+ hw_alloc_blocks = n_alloc_blocks;
+ }
+}
+
+STATIC_INLINE
+void recordFreedBlocks(uint32_t node, uint32_t n)
+{
+ ASSERT(n_alloc_blocks >= n);
+ n_alloc_blocks -= n;
+ n_alloc_blocks_by_node[node] -= n;
+}
+
+/* -----------------------------------------------------------------------------
Allocation
-------------------------------------------------------------------------- */
@@ -248,14 +275,14 @@ log_2_ceil(W_ n)
}
STATIC_INLINE void
-free_list_insert (bdescr *bd)
+free_list_insert (uint32_t node, bdescr *bd)
{
uint32_t ln;
ASSERT(bd->blocks < BLOCKS_PER_MBLOCK);
ln = log_2(bd->blocks);
- dbl_link_onto(bd, &free_list[ln]);
+ dbl_link_onto(bd, &free_list[node][ln]);
}
@@ -284,18 +311,18 @@ setup_tail (bdescr *bd)
// Take a free block group bd, and split off a group of size n from
// it. Adjust the free list as necessary, and return the new group.
static bdescr *
-split_free_block (bdescr *bd, W_ n, uint32_t ln)
+split_free_block (bdescr *bd, uint32_t node, W_ n, uint32_t ln)
{
bdescr *fg; // free group
ASSERT(bd->blocks > n);
- dbl_link_remove(bd, &free_list[ln]);
+ dbl_link_remove(bd, &free_list[node][ln]);
fg = bd + bd->blocks - n; // take n blocks off the end
fg->blocks = n;
bd->blocks -= n;
setup_tail(bd);
ln = log_2(bd->blocks);
- dbl_link_onto(bd, &free_list[ln]);
+ dbl_link_onto(bd, &free_list[node][ln]);
return fg;
}
@@ -304,7 +331,7 @@ split_free_block (bdescr *bd, W_ n, uint32_t ln)
* initGroup afterwards.
*/
static bdescr *
-alloc_mega_group (StgWord mblocks)
+alloc_mega_group (uint32_t node, StgWord mblocks)
{
bdescr *best, *bd, *prev;
StgWord n;
@@ -313,14 +340,14 @@ alloc_mega_group (StgWord mblocks)
best = NULL;
prev = NULL;
- for (bd = free_mblock_list; bd != NULL; prev = bd, bd = bd->link)
+ for (bd = free_mblock_list[node]; bd != NULL; prev = bd, bd = bd->link)
{
if (bd->blocks == n)
{
if (prev) {
prev->link = bd->link;
} else {
- free_mblock_list = bd->link;
+ free_mblock_list[node] = bd->link;
}
return bd;
}
@@ -341,12 +368,17 @@ alloc_mega_group (StgWord mblocks)
(best_mblocks-mblocks)*MBLOCK_SIZE);
best->blocks = MBLOCK_GROUP_BLOCKS(best_mblocks - mblocks);
- initMBlock(MBLOCK_ROUND_DOWN(bd));
+ initMBlock(MBLOCK_ROUND_DOWN(bd), node);
}
else
{
- void *mblock = getMBlocks(mblocks);
- initMBlock(mblock); // only need to init the 1st one
+ void *mblock;
+ if (RtsFlags.GcFlags.numa) {
+ mblock = getMBlocksOnNode(node, mblocks);
+ } else {
+ mblock = getMBlocks(mblocks);
+ }
+ initMBlock(mblock, node); // only need to init the 1st one
bd = FIRST_BDESCR(mblock);
}
bd->blocks = MBLOCK_GROUP_BLOCKS(mblocks);
@@ -354,7 +386,7 @@ alloc_mega_group (StgWord mblocks)
}
bdescr *
-allocGroup (W_ n)
+allocGroupOnNode (uint32_t node, W_ n)
{
bdescr *bd, *rem;
StgWord ln;
@@ -369,21 +401,19 @@ allocGroup (W_ n)
// n_alloc_blocks doesn't count the extra blocks we get in a
// megablock group.
- n_alloc_blocks += mblocks * BLOCKS_PER_MBLOCK;
- if (n_alloc_blocks > hw_alloc_blocks) hw_alloc_blocks = n_alloc_blocks;
+ recordAllocatedBlocks(node, mblocks * BLOCKS_PER_MBLOCK);
- bd = alloc_mega_group(mblocks);
+ bd = alloc_mega_group(node, mblocks);
// only the bdescrs of the first MB are required to be initialised
initGroup(bd);
goto finish;
}
- n_alloc_blocks += n;
- if (n_alloc_blocks > hw_alloc_blocks) hw_alloc_blocks = n_alloc_blocks;
+ recordAllocatedBlocks(node, n);
ln = log_2_ceil(n);
- while (ln < NUM_FREE_LISTS && free_list[ln] == NULL) {
+ while (ln < NUM_FREE_LISTS && free_list[node][ln] == NULL) {
ln++;
}
@@ -397,27 +427,27 @@ allocGroup (W_ n)
}
#endif
- bd = alloc_mega_group(1);
+ bd = alloc_mega_group(node,1);
bd->blocks = n;
initGroup(bd); // we know the group will fit
rem = bd + n;
rem->blocks = BLOCKS_PER_MBLOCK-n;
- initGroup(rem); // init the slop
- n_alloc_blocks += rem->blocks;
+ initGroup(rem); // init the slop
+ recordAllocatedBlocks(node,rem->blocks);
freeGroup(rem); // add the slop on to the free list
goto finish;
}
- bd = free_list[ln];
+ bd = free_list[node][ln];
if (bd->blocks == n) // exactly the right size!
{
- dbl_link_remove(bd, &free_list[ln]);
+ dbl_link_remove(bd, &free_list[node][ln]);
initGroup(bd);
}
else if (bd->blocks > n) // block too big...
{
- bd = split_free_block(bd, n, ln);
+ bd = split_free_block(bd, node, n, ln);
ASSERT(bd->blocks == n);
initGroup(bd);
}
@@ -432,6 +462,26 @@ finish:
return bd;
}
+STATIC_INLINE
+uint32_t nodeWithLeastBlocks (void)
+{
+ uint32_t node = 0, i;
+ uint32_t min_blocks = n_alloc_blocks_by_node[0];
+ for (i = 1; i < RtsFlags.GcFlags.nNumaNodes; i++) {
+ if (n_alloc_blocks_by_node[i] < min_blocks) {
+ min_blocks = n_alloc_blocks_by_node[i];
+ node = i;
+ }
+ }
+ return node;
+}
+
+bdescr* allocGroup (W_ n)
+{
+ return allocGroupOnNode(nodeWithLeastBlocks(),n);
+}
+
+
//
// Allocate a chunk of blocks that is at least min and at most max
// blocks in size. This API is used by the nursery allocator that
@@ -448,8 +498,7 @@ finish:
// fragmentation, but we make sure that we allocate large blocks
// preferably if there are any.
//
-bdescr *
-allocLargeChunk (W_ min, W_ max)
+bdescr* allocLargeChunkOnNode (uint32_t node, W_ min, W_ max)
{
bdescr *bd;
StgWord ln, lnmax;
@@ -461,34 +510,38 @@ allocLargeChunk (W_ min, W_ max)
ln = log_2_ceil(min);
lnmax = log_2_ceil(max);
- while (ln < NUM_FREE_LISTS && ln < lnmax && free_list[ln] == NULL) {
+ while (ln < NUM_FREE_LISTS && ln < lnmax && free_list[node][ln] == NULL) {
ln++;
}
if (ln == NUM_FREE_LISTS || ln == lnmax) {
- return allocGroup(max);
+ return allocGroupOnNode(node,max);
}
- bd = free_list[ln];
+ bd = free_list[node][ln];
if (bd->blocks <= max) // exactly the right size!
{
- dbl_link_remove(bd, &free_list[ln]);
+ dbl_link_remove(bd, &free_list[node][ln]);
initGroup(bd);
}
else // block too big...
{
- bd = split_free_block(bd, max, ln);
+ bd = split_free_block(bd, node, max, ln);
ASSERT(bd->blocks == max);
initGroup(bd);
}
- n_alloc_blocks += bd->blocks;
- if (n_alloc_blocks > hw_alloc_blocks) hw_alloc_blocks = n_alloc_blocks;
+ recordAllocatedBlocks(node, bd->blocks);
IF_DEBUG(sanity, memset(bd->start, 0xaa, bd->blocks * BLOCK_SIZE));
IF_DEBUG(sanity, checkFreeListSanity());
return bd;
}
+bdescr* allocLargeChunk (W_ min, W_ max)
+{
+ return allocLargeChunkOnNode(nodeWithLeastBlocks(), min, max);
+}
+
bdescr *
allocGroup_lock(W_ n)
{
@@ -500,17 +553,31 @@ allocGroup_lock(W_ n)
}
bdescr *
-allocBlock(void)
+allocBlock_lock(void)
{
- return allocGroup(1);
+ bdescr *bd;
+ ACQUIRE_SM_LOCK;
+ bd = allocBlock();
+ RELEASE_SM_LOCK;
+ return bd;
}
bdescr *
-allocBlock_lock(void)
+allocGroupOnNode_lock(uint32_t node, W_ n)
{
bdescr *bd;
ACQUIRE_SM_LOCK;
- bd = allocBlock();
+ bd = allocGroupOnNode(node,n);
+ RELEASE_SM_LOCK;
+ return bd;
+}
+
+bdescr *
+allocBlockOnNode_lock(uint32_t node)
+{
+ bdescr *bd;
+ ACQUIRE_SM_LOCK;
+ bd = allocBlockOnNode(node);
RELEASE_SM_LOCK;
return bd;
}
@@ -542,11 +609,13 @@ static void
free_mega_group (bdescr *mg)
{
bdescr *bd, *prev;
+ uint32_t node;
// Find the right place in the free list. free_mblock_list is
// sorted by *address*, not by size as the free_list is.
prev = NULL;
- bd = free_mblock_list;
+ node = mg->node;
+ bd = free_mblock_list[node];
while (bd && bd->start < mg->start) {
prev = bd;
bd = bd->link;
@@ -561,8 +630,8 @@ free_mega_group (bdescr *mg)
}
else
{
- mg->link = free_mblock_list;
- free_mblock_list = mg;
+ mg->link = free_mblock_list[node];
+ free_mblock_list[node] = mg;
}
// coalesce forwards
coalesce_mblocks(mg);
@@ -575,12 +644,15 @@ void
freeGroup(bdescr *p)
{
StgWord ln;
+ uint32_t node;
- // Todo: not true in multithreaded GC
+ // not true in multithreaded GC:
// ASSERT_SM_LOCK();
ASSERT(p->free != (P_)-1);
+ node = p->node;
+
p->free = (void *)-1; /* indicates that this block is free */
p->gen = NULL;
p->gen_no = 0;
@@ -597,14 +669,13 @@ freeGroup(bdescr *p)
// If this is an mgroup, make sure it has the right number of blocks
ASSERT(p->blocks == MBLOCK_GROUP_BLOCKS(mblocks));
- n_alloc_blocks -= mblocks * BLOCKS_PER_MBLOCK;
+ recordFreedBlocks(node, mblocks * BLOCKS_PER_MBLOCK);
free_mega_group(p);
return;
}
- ASSERT(n_alloc_blocks >= p->blocks);
- n_alloc_blocks -= p->blocks;
+ recordFreedBlocks(node, p->blocks);
// coalesce forwards
{
@@ -614,7 +685,7 @@ freeGroup(bdescr *p)
{
p->blocks += next->blocks;
ln = log_2(next->blocks);
- dbl_link_remove(next, &free_list[ln]);
+ dbl_link_remove(next, &free_list[node][ln]);
if (p->blocks == BLOCKS_PER_MBLOCK)
{
free_mega_group(p);
@@ -634,7 +705,7 @@ freeGroup(bdescr *p)
if (prev->free == (P_)-1)
{
ln = log_2(prev->blocks);
- dbl_link_remove(prev, &free_list[ln]);
+ dbl_link_remove(prev, &free_list[node][ln]);
prev->blocks += p->blocks;
if (prev->blocks >= BLOCKS_PER_MBLOCK)
{
@@ -646,7 +717,7 @@ freeGroup(bdescr *p)
}
setup_tail(p);
- free_list_insert(p);
+ free_list_insert(node,p);
IF_DEBUG(sanity, checkFreeListSanity());
}
@@ -679,7 +750,7 @@ freeChain_lock(bdescr *bd)
}
static void
-initMBlock(void *mblock)
+initMBlock(void *mblock, uint32_t node)
{
bdescr *bd;
StgWord8 *block;
@@ -695,6 +766,7 @@ initMBlock(void *mblock)
for (; block <= (StgWord8*)LAST_BLOCK(mblock); bd += 1,
block += BLOCK_SIZE) {
bd->start = (void*)block;
+ bd->node = node;
}
}
@@ -734,28 +806,32 @@ countAllocdBlocks(bdescr *bd)
void returnMemoryToOS(uint32_t n /* megablocks */)
{
- static bdescr *bd;
+ bdescr *bd;
+ uint32_t node;
StgWord size;
- bd = free_mblock_list;
- while ((n > 0) && (bd != NULL)) {
- size = BLOCKS_TO_MBLOCKS(bd->blocks);
- if (size > n) {
- StgWord newSize = size - n;
- char *freeAddr = MBLOCK_ROUND_DOWN(bd->start);
- freeAddr += newSize * MBLOCK_SIZE;
- bd->blocks = MBLOCK_GROUP_BLOCKS(newSize);
- freeMBlocks(freeAddr, n);
- n = 0;
- }
- else {
- char *freeAddr = MBLOCK_ROUND_DOWN(bd->start);
- n -= size;
- bd = bd->link;
- freeMBlocks(freeAddr, size);
+ // ToDo: not fair, we free all the memory starting with node 0.
+ for (node = 0; n > 0 && node < RtsFlags.GcFlags.nNumaNodes; node++) {
+ bd = free_mblock_list[node];
+ while ((n > 0) && (bd != NULL)) {
+ size = BLOCKS_TO_MBLOCKS(bd->blocks);
+ if (size > n) {
+ StgWord newSize = size - n;
+ char *freeAddr = MBLOCK_ROUND_DOWN(bd->start);
+ freeAddr += newSize * MBLOCK_SIZE;
+ bd->blocks = MBLOCK_GROUP_BLOCKS(newSize);
+ freeMBlocks(freeAddr, n);
+ n = 0;
+ }
+ else {
+ char *freeAddr = MBLOCK_ROUND_DOWN(bd->start);
+ n -= size;
+ bd = bd->link;
+ freeMBlocks(freeAddr, size);
+ }
}
+ free_mblock_list[node] = bd;
}
- free_mblock_list = bd;
// Ask the OS to release any address space portion
// that was associated with the just released MBlocks
@@ -797,68 +873,71 @@ checkFreeListSanity(void)
{
bdescr *bd, *prev;
StgWord ln, min;
+ uint32_t node;
-
- min = 1;
- for (ln = 0; ln < NUM_FREE_LISTS; ln++) {
- IF_DEBUG(block_alloc,
- debugBelch("free block list [%" FMT_Word "]:\n", ln));
-
- prev = NULL;
- for (bd = free_list[ln]; bd != NULL; prev = bd, bd = bd->link)
- {
+ for (node = 0; node < RtsFlags.GcFlags.nNumaNodes; node++) {
+ min = 1;
+ for (ln = 0; ln < NUM_FREE_LISTS; ln++) {
IF_DEBUG(block_alloc,
- debugBelch("group at %p, length %ld blocks\n",
- bd->start, (long)bd->blocks));
- ASSERT(bd->free == (P_)-1);
- ASSERT(bd->blocks > 0 && bd->blocks < BLOCKS_PER_MBLOCK);
- ASSERT(bd->blocks >= min && bd->blocks <= (min*2 - 1));
- ASSERT(bd->link != bd); // catch easy loops
-
- check_tail(bd);
-
- if (prev)
- ASSERT(bd->u.back == prev);
- else
- ASSERT(bd->u.back == NULL);
+ debugBelch("free block list [%" FMT_Word "]:\n", ln));
+ prev = NULL;
+ for (bd = free_list[node][ln]; bd != NULL; prev = bd, bd = bd->link)
{
- bdescr *next;
- next = bd + bd->blocks;
- if (next <= LAST_BDESCR(MBLOCK_ROUND_DOWN(bd)))
+ IF_DEBUG(block_alloc,
+ debugBelch("group at %p, length %ld blocks\n",
+ bd->start, (long)bd->blocks));
+ ASSERT(bd->free == (P_)-1);
+ ASSERT(bd->blocks > 0 && bd->blocks < BLOCKS_PER_MBLOCK);
+ ASSERT(bd->blocks >= min && bd->blocks <= (min*2 - 1));
+ ASSERT(bd->link != bd); // catch easy loops
+ ASSERT(bd->node == node);
+
+ check_tail(bd);
+
+ if (prev)
+ ASSERT(bd->u.back == prev);
+ else
+ ASSERT(bd->u.back == NULL);
+
{
- ASSERT(next->free != (P_)-1);
+ bdescr *next;
+ next = bd + bd->blocks;
+ if (next <= LAST_BDESCR(MBLOCK_ROUND_DOWN(bd)))
+ {
+ ASSERT(next->free != (P_)-1);
+ }
}
}
+ min = min << 1;
}
- min = min << 1;
- }
- prev = NULL;
- for (bd = free_mblock_list; bd != NULL; prev = bd, bd = bd->link)
- {
- IF_DEBUG(block_alloc,
- debugBelch("mega group at %p, length %ld blocks\n",
- bd->start, (long)bd->blocks));
+ prev = NULL;
+ for (bd = free_mblock_list[node]; bd != NULL; prev = bd, bd = bd->link)
+ {
+ IF_DEBUG(block_alloc,
+ debugBelch("mega group at %p, length %ld blocks\n",
+ bd->start, (long)bd->blocks));
- ASSERT(bd->link != bd); // catch easy loops
+ ASSERT(bd->link != bd); // catch easy loops
- if (bd->link != NULL)
- {
- // make sure the list is sorted
- ASSERT(bd->start < bd->link->start);
- }
+ if (bd->link != NULL)
+ {
+ // make sure the list is sorted
+ ASSERT(bd->start < bd->link->start);
+ }
- ASSERT(bd->blocks >= BLOCKS_PER_MBLOCK);
- ASSERT(MBLOCK_GROUP_BLOCKS(BLOCKS_TO_MBLOCKS(bd->blocks))
- == bd->blocks);
+ ASSERT(bd->blocks >= BLOCKS_PER_MBLOCK);
+ ASSERT(MBLOCK_GROUP_BLOCKS(BLOCKS_TO_MBLOCKS(bd->blocks))
+ == bd->blocks);
- // make sure we're fully coalesced
- if (bd->link != NULL)
- {
- ASSERT(MBLOCK_ROUND_DOWN(bd->link) !=
- (StgWord8*)MBLOCK_ROUND_DOWN(bd) +
- BLOCKS_TO_MBLOCKS(bd->blocks) * MBLOCK_SIZE);
+ // make sure we're fully coalesced
+ if (bd->link != NULL)
+ {
+ ASSERT(MBLOCK_ROUND_DOWN(bd->link) !=
+ (StgWord8*)MBLOCK_ROUND_DOWN(bd) +
+ BLOCKS_TO_MBLOCKS(bd->blocks) * MBLOCK_SIZE);
+ }
}
}
}
@@ -869,18 +948,21 @@ countFreeList(void)
bdescr *bd;
W_ total_blocks = 0;
StgWord ln;
+ uint32_t node;
- for (ln=0; ln < NUM_FREE_LISTS; ln++) {
- for (bd = free_list[ln]; bd != NULL; bd = bd->link) {
- total_blocks += bd->blocks;
+ for (node = 0; node < RtsFlags.GcFlags.nNumaNodes; node++) {
+ for (ln=0; ln < NUM_FREE_LISTS; ln++) {
+ for (bd = free_list[node][ln]; bd != NULL; bd = bd->link) {
+ total_blocks += bd->blocks;
+ }
+ }
+ for (bd = free_mblock_list[node]; bd != NULL; bd = bd->link) {
+ total_blocks += BLOCKS_PER_MBLOCK * BLOCKS_TO_MBLOCKS(bd->blocks);
+ // The caller of this function, memInventory(), expects to match
+ // the total number of blocks in the system against mblocks *
+ // BLOCKS_PER_MBLOCK, so we must subtract the space for the
+ // block descriptors from *every* mblock.
}
- }
- for (bd = free_mblock_list; bd != NULL; bd = bd->link) {
- total_blocks += BLOCKS_PER_MBLOCK * BLOCKS_TO_MBLOCKS(bd->blocks);
- // The caller of this function, memInventory(), expects to match
- // the total number of blocks in the system against mblocks *
- // BLOCKS_PER_MBLOCK, so we must subtract the space for the
- // block descriptors from *every* mblock.
}
return total_blocks;
}
diff --git a/rts/sm/BlockAlloc.h b/rts/sm/BlockAlloc.h
index 2ba7c02c08..c26ae104e1 100644
--- a/rts/sm/BlockAlloc.h
+++ b/rts/sm/BlockAlloc.h
@@ -12,6 +12,7 @@
#include "BeginPrivate.h"
bdescr *allocLargeChunk (W_ min, W_ max);
+bdescr *allocLargeChunkOnNode (uint32_t node, W_ min, W_ max);
/* Debugging -------------------------------------------------------------- */
diff --git a/rts/sm/GC.c b/rts/sm/GC.c
index 996ce8cbce..3bfdaa25ff 100644
--- a/rts/sm/GC.c
+++ b/rts/sm/GC.c
@@ -802,7 +802,8 @@ new_gc_thread (uint32_t n, gc_thread *t)
// but can't, because it uses gct which isn't set up at this point.
// Hence, allocate a block for todo_bd manually:
{
- bdescr *bd = allocBlock(); // no lock, locks aren't initialised yet
+ bdescr *bd = allocBlockOnNode(capNoToNumaNode(n));
+ // no lock, locks aren't initialised yet
initBdescr(bd, ws->gen, ws->gen->to);
bd->flags = BF_EVACUATED;
bd->u.scan = bd->free = bd->start;
@@ -1182,7 +1183,8 @@ prepare_collected_gen (generation *gen)
if (g != 0) {
for (i = 0; i < n_capabilities; i++) {
freeChain(capabilities[i]->mut_lists[g]);
- capabilities[i]->mut_lists[g] = allocBlock();
+ capabilities[i]->mut_lists[g] =
+ allocBlockOnNode(capNoToNumaNode(i));
}
}
@@ -1296,7 +1298,7 @@ static void
stash_mut_list (Capability *cap, uint32_t gen_no)
{
cap->saved_mut_lists[gen_no] = cap->mut_lists[gen_no];
- cap->mut_lists[gen_no] = allocBlock_sync();
+ cap->mut_lists[gen_no] = allocBlockOnNode_sync(cap->node);
}
/* ----------------------------------------------------------------------------
diff --git a/rts/sm/GCUtils.c b/rts/sm/GCUtils.c
index 5edf9dedbc..a515665d07 100644
--- a/rts/sm/GCUtils.c
+++ b/rts/sm/GCUtils.c
@@ -30,34 +30,33 @@
SpinLock gc_alloc_block_sync;
#endif
-bdescr *
-allocBlock_sync(void)
+bdescr* allocGroup_sync(uint32_t n)
{
bdescr *bd;
+ uint32_t node = capNoToNumaNode(gct->thread_index);
ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
- bd = allocBlock();
+ bd = allocGroupOnNode(node,n);
RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
return bd;
}
-static bdescr *
-allocGroup_sync(uint32_t n)
+bdescr* allocGroupOnNode_sync(uint32_t node, uint32_t n)
{
bdescr *bd;
ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
- bd = allocGroup(n);
+ bd = allocGroupOnNode(node,n);
RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
return bd;
}
-
static uint32_t
allocBlocks_sync(uint32_t n, bdescr **hd)
{
bdescr *bd;
uint32_t i;
+ uint32_t node = capNoToNumaNode(gct->thread_index);
ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
- bd = allocLargeChunk(1,n);
+ bd = allocLargeChunkOnNode(node,1,n);
// NB. allocLargeChunk, rather than allocGroup(n), to allocate in a
// fragmentation-friendly way.
n = bd->blocks;
diff --git a/rts/sm/GCUtils.h b/rts/sm/GCUtils.h
index 0f87eee3f1..7e5a827ce0 100644
--- a/rts/sm/GCUtils.h
+++ b/rts/sm/GCUtils.h
@@ -18,7 +18,19 @@
#include "GCTDecl.h"
-bdescr *allocBlock_sync(void);
+bdescr* allocGroup_sync(uint32_t n);
+bdescr* allocGroupOnNode_sync(uint32_t node, uint32_t n);
+
+INLINE_HEADER bdescr *allocBlock_sync(void)
+{
+ return allocGroup_sync(1);
+}
+
+INLINE_HEADER bdescr *allocBlockOnNode_sync(uint32_t node)
+{
+ return allocGroupOnNode_sync(node,1);
+}
+
void freeChain_sync(bdescr *bd);
void push_scanned_block (bdescr *bd, gen_workspace *ws);
diff --git a/rts/sm/MBlock.c b/rts/sm/MBlock.c
index 440b03efa7..53999d2c4b 100644
--- a/rts/sm/MBlock.c
+++ b/rts/sm/MBlock.c
@@ -566,7 +566,7 @@ void releaseFreeMemory(void)
void *
getMBlock(void)
{
- return getMBlocks(1);
+ return getMBlocks(1);
}
// The external interface: allocate 'n' mblocks, and return the
@@ -587,6 +587,23 @@ getMBlocks(uint32_t n)
return ret;
}
+void *
+getMBlocksOnNode(uint32_t node, uint32_t n)
+{
+ void *addr = getMBlocks(n);
+#ifdef DEBUG
+ if (RtsFlags.DebugFlags.numa) return addr; // faking NUMA
+#endif
+ osBindMBlocksToNode(addr, n * MBLOCK_SIZE, RtsFlags.GcFlags.numaMap[node]);
+ return addr;
+}
+
+void *
+getMBlockOnNode(uint32_t node)
+{
+ return getMBlocksOnNode(node, 1);
+}
+
void
freeMBlocks(void *addr, uint32_t n)
{
diff --git a/rts/sm/MarkStack.h b/rts/sm/MarkStack.h
index f978a32563..d90b5e47b4 100644
--- a/rts/sm/MarkStack.h
+++ b/rts/sm/MarkStack.h
@@ -15,6 +15,7 @@
#define SM_MARKSTACK_H
#include "BeginPrivate.h"
+#include "GCUtils.h"
INLINE_HEADER void
push_mark_stack(StgPtr p)
diff --git a/rts/sm/OSMem.h b/rts/sm/OSMem.h
index 8518f05d1b..660942827d 100644
--- a/rts/sm/OSMem.h
+++ b/rts/sm/OSMem.h
@@ -19,6 +19,10 @@ void osFreeAllMBlocks(void);
size_t getPageSize (void);
StgWord64 getPhysicalMemorySize (void);
void setExecutable (void *p, W_ len, rtsBool exec);
+rtsBool osNumaAvailable(void);
+uint32_t osNumaNodes(void);
+StgWord osNumaMask(void);
+void osBindMBlocksToNode(void *addr, StgWord size, uint32_t node);
INLINE_HEADER size_t
roundDownToPage (size_t x)
diff --git a/rts/sm/Storage.c b/rts/sm/Storage.c
index 717c96ae81..a9a7857d43 100644
--- a/rts/sm/Storage.c
+++ b/rts/sm/Storage.c
@@ -6,7 +6,7 @@
*
* Documentation on the architecture of the Storage Manager can be
* found in the online commentary:
- *
+ *
* http://ghc.haskell.org/trac/ghc/wiki/Commentary/Rts/Storage
*
* ---------------------------------------------------------------------------*/
@@ -37,7 +37,7 @@
#include "ffi.h"
-/*
+/*
* All these globals require sm_mutex to access in THREADED_RTS mode.
*/
StgIndStatic *dyn_caf_list = NULL;
@@ -54,9 +54,22 @@ generation *generations = NULL; /* all the generations */
generation *g0 = NULL; /* generation 0, for convenience */
generation *oldest_gen = NULL; /* oldest generation, for convenience */
-nursery *nurseries = NULL; /* array of nurseries, size == n_capabilities */
+/*
+ * Array of nurseries, size == n_capabilities
+ *
+ * nursery[i] belongs to NUMA node (i % RtsFlags.GcFlags.nNumaNodes)
+ * This is chosen to be the same convention as capabilities[i], so
+ * that when not using nursery chunks (+RTS -n), we just map
+ * capabilities to nurseries 1:1.
+ */
+nursery *nurseries = NULL;
uint32_t n_nurseries;
-volatile StgWord next_nursery = 0;
+
+/*
+ * When we are using nursery chunks, we need a separate next_nursery
+ * pointer for each NUMA node.
+ */
+volatile StgWord next_nursery[MAX_NUMA_NODES];
#ifdef THREADED_RTS
/*
@@ -104,7 +117,7 @@ initGeneration (generation *gen, int g)
void
initStorage (void)
{
- uint32_t g;
+ uint32_t g, n;
if (generations != NULL) {
// multi-init protection
@@ -120,22 +133,22 @@ initStorage (void)
ASSERT(LOOKS_LIKE_INFO_PTR_NOT_NULL((StgWord)&stg_BLOCKING_QUEUE_CLEAN_info));
ASSERT(LOOKS_LIKE_CLOSURE_PTR(&stg_dummy_ret_closure));
ASSERT(!HEAP_ALLOCED(&stg_dummy_ret_closure));
-
+
if (RtsFlags.GcFlags.maxHeapSize != 0 &&
- RtsFlags.GcFlags.heapSizeSuggestion >
+ RtsFlags.GcFlags.heapSizeSuggestion >
RtsFlags.GcFlags.maxHeapSize) {
RtsFlags.GcFlags.maxHeapSize = RtsFlags.GcFlags.heapSizeSuggestion;
}
if (RtsFlags.GcFlags.maxHeapSize != 0 &&
- RtsFlags.GcFlags.minAllocAreaSize >
+ RtsFlags.GcFlags.minAllocAreaSize >
RtsFlags.GcFlags.maxHeapSize) {
errorBelch("maximum heap size (-M) is smaller than minimum alloc area size (-A)");
RtsFlags.GcFlags.minAllocAreaSize = RtsFlags.GcFlags.maxHeapSize;
}
initBlockAllocator();
-
+
#if defined(THREADED_RTS)
initMutex(&sm_mutex);
#endif
@@ -143,7 +156,7 @@ initStorage (void)
ACQUIRE_SM_LOCK;
/* allocate generation info array */
- generations = (generation *)stgMallocBytes(RtsFlags.GcFlags.generations
+ generations = (generation *)stgMallocBytes(RtsFlags.GcFlags.generations
* sizeof(struct generation_),
"initStorage: gens");
@@ -161,7 +174,7 @@ initStorage (void)
generations[g].to = &generations[g+1];
}
oldest_gen->to = oldest_gen;
-
+
/* The oldest generation has one step. */
if (RtsFlags.GcFlags.compact || RtsFlags.GcFlags.sweep) {
if (RtsFlags.GcFlags.generations == 1) {
@@ -178,7 +191,7 @@ initStorage (void)
dyn_caf_list = (StgIndStatic*)END_OF_CAF_LIST;
debug_caf_list = (StgIndStatic*)END_OF_CAF_LIST;
revertible_caf_list = (StgIndStatic*)END_OF_CAF_LIST;
-
+
if (RtsFlags.GcFlags.largeAllocLim > 0) {
large_alloc_lim = RtsFlags.GcFlags.largeAllocLim * BLOCK_SIZE_W;
} else {
@@ -196,7 +209,9 @@ initStorage (void)
N = 0;
- next_nursery = 0;
+ for (n = 0; n < RtsFlags.GcFlags.nNumaNodes; n++) {
+ next_nursery[n] = n;
+ }
storageAddCapabilities(0, n_capabilities);
IF_DEBUG(gc, statDescribeGens());
@@ -257,7 +272,8 @@ void storageAddCapabilities (uint32_t from, uint32_t to)
// allocate a block for each mut list
for (n = from; n < to; n++) {
for (g = 1; g < RtsFlags.GcFlags.generations; g++) {
- capabilities[n]->mut_lists[g] = allocBlock();
+ capabilities[n]->mut_lists[g] =
+ allocBlockOnNode(capNoToNumaNode(n));
}
}
@@ -526,7 +542,7 @@ StgInd* newGCdCAF (StgRegTable *reg, StgIndStatic *caf)
-------------------------------------------------------------------------- */
static bdescr *
-allocNursery (bdescr *tail, W_ blocks)
+allocNursery (uint32_t node, bdescr *tail, W_ blocks)
{
bdescr *bd = NULL;
W_ i, n;
@@ -542,7 +558,7 @@ allocNursery (bdescr *tail, W_ blocks)
// allocLargeChunk will prefer large chunks, but will pick up
// small chunks if there are any available. We must allow
// single blocks here to avoid fragmentation (#7257)
- bd = allocLargeChunk(1, n);
+ bd = allocLargeChunkOnNode(node, 1, n);
n = bd->blocks;
blocks -= n;
@@ -584,6 +600,7 @@ assignNurseryToCapability (Capability *cap, uint32_t n)
cap->r.rCurrentNursery = nurseries[n].blocks;
newNurseryBlock(nurseries[n].blocks);
cap->r.rCurrentAlloc = NULL;
+ ASSERT(cap->r.rCurrentNursery->node == cap->node);
}
/*
@@ -593,16 +610,18 @@ assignNurseryToCapability (Capability *cap, uint32_t n)
static void
assignNurseriesToCapabilities (uint32_t from, uint32_t to)
{
- uint32_t i;
+ uint32_t i, node;
for (i = from; i < to; i++) {
- assignNurseryToCapability(capabilities[i], next_nursery++);
+ node = capabilities[i]->node;
+ assignNurseryToCapability(capabilities[i], next_nursery[node]);
+ next_nursery[node] += RtsFlags.GcFlags.nNumaNodes;
}
}
static void
allocNurseries (uint32_t from, uint32_t to)
-{
+{
uint32_t i;
memcount n_blocks;
@@ -613,24 +632,28 @@ allocNurseries (uint32_t from, uint32_t to)
}
for (i = from; i < to; i++) {
- nurseries[i].blocks = allocNursery(NULL, n_blocks);
+ nurseries[i].blocks = allocNursery(capNoToNumaNode(i), NULL, n_blocks);
nurseries[i].n_blocks = n_blocks;
}
}
-
+
void
resetNurseries (void)
{
- next_nursery = 0;
+ uint32_t n;
+
+ for (n = 0; n < RtsFlags.GcFlags.nNumaNodes; n++) {
+ next_nursery[n] = n;
+ }
assignNurseriesToCapabilities(0, n_capabilities);
#ifdef DEBUG
bdescr *bd;
- uint32_t n;
for (n = 0; n < n_nurseries; n++) {
for (bd = nurseries[n].blocks; bd; bd = bd->link) {
ASSERT(bd->gen_no == 0);
ASSERT(bd->gen == g0);
+ ASSERT(bd->node == capNoToNumaNode(n));
IF_DEBUG(sanity, memset(bd->start, 0xaa, BLOCK_SIZE));
}
}
@@ -649,56 +672,54 @@ countNurseryBlocks (void)
return blocks;
}
-static void
-resizeNursery (nursery *nursery, W_ blocks)
-{
- bdescr *bd;
- W_ nursery_blocks;
-
- nursery_blocks = nursery->n_blocks;
- if (nursery_blocks == blocks) return;
-
- if (nursery_blocks < blocks) {
- debugTrace(DEBUG_gc, "increasing size of nursery to %d blocks",
- blocks);
- nursery->blocks = allocNursery(nursery->blocks, blocks-nursery_blocks);
- }
- else {
- bdescr *next_bd;
-
- debugTrace(DEBUG_gc, "decreasing size of nursery to %d blocks",
- blocks);
-
- bd = nursery->blocks;
- while (nursery_blocks > blocks) {
- next_bd = bd->link;
- next_bd->u.back = NULL;
- nursery_blocks -= bd->blocks; // might be a large block
- freeGroup(bd);
- bd = next_bd;
- }
- nursery->blocks = bd;
- // might have gone just under, by freeing a large block, so make
- // up the difference.
- if (nursery_blocks < blocks) {
- nursery->blocks = allocNursery(nursery->blocks, blocks-nursery_blocks);
- }
- }
-
- nursery->n_blocks = blocks;
- ASSERT(countBlocks(nursery->blocks) == nursery->n_blocks);
-}
-
-//
+//
// Resize each of the nurseries to the specified size.
//
static void
resizeNurseriesEach (W_ blocks)
{
- uint32_t i;
+ uint32_t i, node;
+ bdescr *bd;
+ W_ nursery_blocks;
+ nursery *nursery;
for (i = 0; i < n_nurseries; i++) {
- resizeNursery(&nurseries[i], blocks);
+ nursery = &nurseries[i];
+ nursery_blocks = nursery->n_blocks;
+ if (nursery_blocks == blocks) continue;
+
+ node = capNoToNumaNode(i);
+ if (nursery_blocks < blocks) {
+ debugTrace(DEBUG_gc, "increasing size of nursery to %d blocks",
+ blocks);
+ nursery->blocks = allocNursery(node, nursery->blocks,
+ blocks-nursery_blocks);
+ }
+ else
+ {
+ bdescr *next_bd;
+
+ debugTrace(DEBUG_gc, "decreasing size of nursery to %d blocks",
+ blocks);
+
+ bd = nursery->blocks;
+ while (nursery_blocks > blocks) {
+ next_bd = bd->link;
+ next_bd->u.back = NULL;
+ nursery_blocks -= bd->blocks; // might be a large block
+ freeGroup(bd);
+ bd = next_bd;
+ }
+ nursery->blocks = bd;
+ // might have gone just under, by freeing a large block, so make
+ // up the difference.
+ if (nursery_blocks < blocks) {
+ nursery->blocks = allocNursery(node, nursery->blocks,
+ blocks-nursery_blocks);
+ }
+ }
+ nursery->n_blocks = blocks;
+ ASSERT(countBlocks(nursery->blocks) == nursery->n_blocks);
}
}
@@ -716,7 +737,7 @@ resizeNurseriesFixed (void)
resizeNurseriesEach(blocks);
}
-//
+//
// Resize the nurseries to the total specified size.
//
void
@@ -731,18 +752,42 @@ rtsBool
getNewNursery (Capability *cap)
{
StgWord i;
+ uint32_t node = cap->node;
+ uint32_t n;
for(;;) {
- i = next_nursery;
- if (i >= n_nurseries) {
+ i = next_nursery[node];
+ if (i < n_nurseries) {
+ if (cas(&next_nursery[node], i,
+ i+RtsFlags.GcFlags.nNumaNodes) == i) {
+ assignNurseryToCapability(cap, i);
+ return rtsTrue;
+ }
+ } else if (RtsFlags.GcFlags.nNumaNodes > 1) {
+ // Try to find an unused nursery chunk on other nodes. We'll get
+ // remote memory, but the rationale is that avoiding GC is better
+ // than avoiding remote memory access.
+ rtsBool lost = rtsFalse;
+ for (n = 0; n < RtsFlags.GcFlags.nNumaNodes; n++) {
+ if (n == node) continue;
+ i = next_nursery[n];
+ if (i < n_nurseries) {
+ if (cas(&next_nursery[n], i,
+ i+RtsFlags.GcFlags.nNumaNodes) == i) {
+ assignNurseryToCapability(cap, i);
+ return rtsTrue;
+ } else {
+ lost = rtsTrue; /* lost a race */
+ }
+ }
+ }
+ if (!lost) return rtsFalse;
+ } else {
return rtsFalse;
}
- if (cas(&next_nursery, i, i+1) == i) {
- assignNurseryToCapability(cap, i);
- return rtsTrue;
- }
}
}
+
/* -----------------------------------------------------------------------------
move_STACK is called to update the TSO structure after it has been
moved from one place to another.
@@ -753,8 +798,8 @@ move_STACK (StgStack *src, StgStack *dest)
{
ptrdiff_t diff;
- // relocate the stack pointer...
- diff = (StgPtr)dest - (StgPtr)src; // In *words*
+ // relocate the stack pointer...
+ diff = (StgPtr)dest - (StgPtr)src; // In *words*
dest->sp = (StgPtr)dest->sp + diff;
}
@@ -818,7 +863,7 @@ allocate (Capability *cap, W_ n)
}
ACQUIRE_SM_LOCK
- bd = allocGroup(req_blocks);
+ bd = allocGroupOnNode(cap->node,req_blocks);
dbl_link_onto(bd, &g0->large_objects);
g0->n_large_blocks += bd->blocks; // might be larger than req_blocks
g0->n_new_large_words += n;
@@ -834,19 +879,19 @@ allocate (Capability *cap, W_ n)
bd = cap->r.rCurrentAlloc;
if (bd == NULL || bd->free + n > bd->start + BLOCK_SIZE_W) {
-
+
if (bd) finishedNurseryBlock(cap,bd);
// The CurrentAlloc block is full, we need to find another
// one. First, we try taking the next block from the
// nursery:
bd = cap->r.rCurrentNursery->link;
-
+
if (bd == NULL) {
// The nursery is empty: allocate a fresh block (we can't
// fail here).
ACQUIRE_SM_LOCK;
- bd = allocBlock();
+ bd = allocBlockOnNode(cap->node);
cap->r.rNursery->n_blocks++;
RELEASE_SM_LOCK;
initBdescr(bd, g0, g0);
@@ -944,7 +989,7 @@ allocatePinned (Capability *cap, W_ n)
}
bd = cap->pinned_object_block;
-
+
// If we don't have a block of pinned objects yet, or the current
// one isn't large enough to hold the new object, get a new one.
if (bd == NULL || (bd->free + n) > (bd->start + BLOCK_SIZE_W)) {
@@ -964,24 +1009,13 @@ allocatePinned (Capability *cap, W_ n)
// objects scale really badly if we do this).
//
// So first, we try taking the next block from the nursery, in
- // the same way as allocate(), but note that we can only take
- // an *empty* block, because we're about to mark it as
- // BF_PINNED | BF_LARGE.
+ // the same way as allocate().
bd = cap->r.rCurrentNursery->link;
- if (bd == NULL) { // must be empty!
- // The nursery is empty, or the next block is non-empty:
- // allocate a fresh block (we can't fail here).
-
- // XXX in the case when the next nursery block is
- // non-empty we aren't exerting any pressure to GC soon,
- // so if this case ever happens then we could in theory
- // keep allocating for ever without calling the GC. We
- // can't bump g0->n_new_large_words because that will be
- // counted towards allocation, and we're already counting
- // our pinned obects as allocation in
- // collect_pinned_object_blocks in the GC.
+ if (bd == NULL) {
+ // The nursery is empty: allocate a fresh block (we can't fail
+ // here).
ACQUIRE_SM_LOCK;
- bd = allocBlock();
+ bd = allocBlockOnNode(cap->node);
RELEASE_SM_LOCK;
initBdescr(bd, g0, g0);
} else {
@@ -1217,13 +1251,13 @@ W_ gcThreadLiveBlocks (uint32_t i, uint32_t g)
* that will be collected next time will therefore need twice as many
* blocks since all the data will be copied.
*/
-extern W_
+extern W_
calcNeeded (rtsBool force_major, memcount *blocks_needed)
{
W_ needed = 0, blocks;
uint32_t g, N;
generation *gen;
-
+
if (force_major) {
N = RtsFlags.GcFlags.generations - 1;
} else {
@@ -1238,7 +1272,7 @@ calcNeeded (rtsBool force_major, memcount *blocks_needed)
// we need at least this much space
needed += blocks;
-
+
// are we collecting this gen?
if (g == 0 || // always collect gen 0
blocks > gen->max_blocks)