diff options
author | Simon Marlow <marlowsd@gmail.com> | 2016-04-23 21:14:49 +0100 |
---|---|---|
committer | Simon Marlow <marlowsd@gmail.com> | 2016-06-10 21:25:54 +0100 |
commit | 9e5ea67e268be2659cd30ebaed7044d298198ab0 (patch) | |
tree | c395e74ee772ae0d59c852b3cbde743784b08d09 /includes | |
parent | b9fa72a24ba2cc3120912e6afedc9280d28d2077 (diff) | |
download | haskell-9e5ea67e268be2659cd30ebaed7044d298198ab0.tar.gz |
NUMA support
Summary:
The aim here is to reduce the number of remote memory accesses on
systems with a NUMA memory architecture, typically multi-socket servers.
Linux provides a NUMA API for doing two things:
* Allocating memory local to a particular node
* Binding a thread to a particular node
When given the +RTS --numa flag, the runtime will
* Determine the number of NUMA nodes (N) by querying the OS
* Assign capabilities to nodes, so cap C is on node C%N
* Bind worker threads on a capability to the correct node
* Keep separate free lists in the block layer for each node
* Allocate the nursery for a capability from node-local memory
* Allocate blocks in the GC from node-local memory
For example, using nofib/parallel/queens on a 24-core 2-socket machine:
```
$ ./Main 15 +RTS -N24 -s -A64m
Total time 173.960s ( 7.467s elapsed)
$ ./Main 15 +RTS -N24 -s -A64m --numa
Total time 150.836s ( 6.423s elapsed)
```
The biggest win here is expected to be allocating from node-local
memory, so that means programs using a large -A value (as here).
According to perf, on this program the number of remote memory accesses
was reduced by more than 50% by using `--numa`.
Test Plan:
* validate
* There's a new flag --debug-numa=<n> that pretends to do NUMA without
actually making the OS calls, which is useful for testing the code
on non-NUMA systems.
* TODO: I need to add some unit tests
Reviewers: erikd, austin, rwbarton, ezyang, bgamari, hvr, niteria
Subscribers: thomie
Differential Revision: https://phabricator.haskell.org/D2199
Diffstat (limited to 'includes')
-rw-r--r-- | includes/Cmm.h | 1 | ||||
-rw-r--r-- | includes/Rts.h | 1 | ||||
-rw-r--r-- | includes/RtsAPI.h | 6 | ||||
-rw-r--r-- | includes/rts/Constants.h | 6 | ||||
-rw-r--r-- | includes/rts/Flags.h | 8 | ||||
-rw-r--r-- | includes/rts/OSThreads.h | 4 | ||||
-rw-r--r-- | includes/rts/Threads.h | 4 | ||||
-rw-r--r-- | includes/rts/storage/Block.h | 20 | ||||
-rw-r--r-- | includes/rts/storage/MBlock.h | 2 | ||||
-rw-r--r-- | includes/rts/storage/SMPClosureOps.h | 125 |
10 files changed, 44 insertions, 133 deletions
diff --git a/includes/Cmm.h b/includes/Cmm.h index cbd7e36ac1..3b9a5a6794 100644 --- a/includes/Cmm.h +++ b/includes/Cmm.h @@ -325,7 +325,6 @@ #include "DerivedConstants.h" #include "rts/storage/ClosureTypes.h" #include "rts/storage/FunTypes.h" -#include "rts/storage/SMPClosureOps.h" #include "rts/OSThreads.h" /* diff --git a/includes/Rts.h b/includes/Rts.h index 1ad1bba5f8..3d4538f41d 100644 --- a/includes/Rts.h +++ b/includes/Rts.h @@ -203,7 +203,6 @@ INLINE_HEADER Time fsecondsToTime (double t) #include "rts/storage/ClosureTypes.h" #include "rts/storage/TSO.h" #include "stg/MiscClosures.h" /* InfoTables, closures etc. defined in the RTS */ -#include "rts/storage/SMPClosureOps.h" #include "rts/storage/Block.h" #include "rts/storage/ClosureMacros.h" #include "rts/storage/MBlock.h" diff --git a/includes/RtsAPI.h b/includes/RtsAPI.h index a4a094fd14..230c982c12 100644 --- a/includes/RtsAPI.h +++ b/includes/RtsAPI.h @@ -179,7 +179,11 @@ Capability *rts_unsafeGetMyCapability (void); // Note that the thread may still be migrated by the RTS scheduler, but that // will only happen if there are multiple threads running on one Capability and // another Capability is free. -void setInCallCapability (int preferred_capability); +// +// If affinity is non-zero, the current thread will be bound to +// specific CPUs according to the prevailing affinity policy for the +// specified capability, set by either +RTS -qa or +RTS --numa. +void rts_setInCallCapability (int preferred_capability, int affinity); /* ---------------------------------------------------------------------------- Building Haskell objects from C datatypes. diff --git a/includes/rts/Constants.h b/includes/rts/Constants.h index b65b8d3a9d..114f30ce2f 100644 --- a/includes/rts/Constants.h +++ b/includes/rts/Constants.h @@ -295,4 +295,10 @@ #define MAX_SPARE_WORKERS 6 +/* + * The maximum number of NUMA nodes we support. This is a fixed limit so that + * we can have static arrays of this size in the RTS for speed. 
+ */ +#define MAX_NUMA_NODES 16 + #endif /* RTS_CONSTANTS_H */ diff --git a/includes/rts/Flags.h b/includes/rts/Flags.h index 8020a177b0..ff303dc5e6 100644 --- a/includes/rts/Flags.h +++ b/includes/rts/Flags.h @@ -73,6 +73,11 @@ typedef struct _GC_FLAGS { * to handle the exception before we * raise it again. */ + + rtsBool numa; /* Use NUMA */ + uint32_t nNumaNodes; /* Number of nodes */ + uint32_t numaMap[MAX_NUMA_NODES]; /* Map our internal node numbers to OS + * node numbers */ } GC_FLAGS; /* See Note [Synchronization of flags and base APIs] */ @@ -93,6 +98,7 @@ typedef struct _DEBUG_FLAGS { rtsBool squeeze; /* 'z' stack squeezing & lazy blackholing */ rtsBool hpc; /* 'c' coverage */ rtsBool sparks; /* 'r' */ + rtsBool numa; /* '--debug-numa' */ } DEBUG_FLAGS; /* See Note [Synchronization of flags and base APIs] */ @@ -184,7 +190,7 @@ typedef struct _MISC_FLAGS { #ifdef THREADED_RTS /* See Note [Synchronization of flags and base APIs] */ typedef struct _PAR_FLAGS { - uint32_t nNodes; /* number of threads to run simultaneously */ + uint32_t nCapabilities; /* number of threads to run simultaneously */ rtsBool migrate; /* migrate threads between capabilities */ uint32_t maxLocalSparks; rtsBool parGcEnabled; /* enable parallel GC */ diff --git a/includes/rts/OSThreads.h b/includes/rts/OSThreads.h index ee1855b4da..bc84b714db 100644 --- a/includes/rts/OSThreads.h +++ b/includes/rts/OSThreads.h @@ -200,7 +200,9 @@ void setThreadLocalVar (ThreadLocalKey *key, void *value); void freeThreadLocalKey (ThreadLocalKey *key); // Processors and affinity -void setThreadAffinity (uint32_t n, uint32_t m); +void setThreadAffinity (uint32_t n, uint32_t m); +void setThreadNode (uint32_t node); +void releaseThreadNode (void); #endif // !CMINUSMINUS #else diff --git a/includes/rts/Threads.h b/includes/rts/Threads.h index 67d01db7d4..866c4692bd 100644 --- a/includes/rts/Threads.h +++ b/includes/rts/Threads.h @@ -58,7 +58,9 @@ pid_t forkProcess (HsStablePtr *entry) HsBool 
rtsSupportsBoundThreads (void); -// The number of Capabilities +// The number of Capabilities. +// ToDo: I would like this to be private to the RTS and instead expose a +// function getNumCapabilities(), but it is used in compiler/cbits/genSym.c extern unsigned int n_capabilities; // The number of Capabilities that are not disabled diff --git a/includes/rts/storage/Block.h b/includes/rts/storage/Block.h index 1a31de5512..e04cfdfec6 100644 --- a/includes/rts/storage/Block.h +++ b/includes/rts/storage/Block.h @@ -111,7 +111,7 @@ typedef struct bdescr_ { StgWord16 gen_no; // gen->no, cached StgWord16 dest_no; // number of destination generation - StgWord16 _pad1; + StgWord16 node; // which memory node does this block live on? StgWord16 flags; // block flags, see below @@ -280,12 +280,28 @@ extern void initBlockAllocator(void); /* Allocation -------------------------------------------------------------- */ bdescr *allocGroup(W_ n); -bdescr *allocBlock(void); + +EXTERN_INLINE bdescr* allocBlock(void); +EXTERN_INLINE bdescr* allocBlock(void) +{ + return allocGroup(1); +} + +bdescr *allocGroupOnNode(uint32_t node, W_ n); + +EXTERN_INLINE bdescr* allocBlockOnNode(uint32_t node); +EXTERN_INLINE bdescr* allocBlockOnNode(uint32_t node) +{ + return allocGroupOnNode(node,1); +} // versions that take the storage manager lock for you: bdescr *allocGroup_lock(W_ n); bdescr *allocBlock_lock(void); +bdescr *allocGroupOnNode_lock(uint32_t node, W_ n); +bdescr *allocBlockOnNode_lock(uint32_t node); + /* De-Allocation ----------------------------------------------------------- */ void freeGroup(bdescr *p); diff --git a/includes/rts/storage/MBlock.h b/includes/rts/storage/MBlock.h index 419a96e225..a8251c8c0b 100644 --- a/includes/rts/storage/MBlock.h +++ b/includes/rts/storage/MBlock.h @@ -18,6 +18,8 @@ extern W_ mblocks_allocated; extern void initMBlocks(void); extern void * getMBlock(void); extern void * getMBlocks(uint32_t n); +extern void * getMBlockOnNode(uint32_t node); +extern 
void * getMBlocksOnNode(uint32_t node, uint32_t n); extern void freeMBlocks(void *addr, uint32_t n); extern void releaseFreeMemory(void); extern void freeAllMBlocks(void); diff --git a/includes/rts/storage/SMPClosureOps.h b/includes/rts/storage/SMPClosureOps.h deleted file mode 100644 index ee92186f5f..0000000000 --- a/includes/rts/storage/SMPClosureOps.h +++ /dev/null @@ -1,125 +0,0 @@ -/* ---------------------------------------------------------------------------- - * - * (c) The GHC Team, 2005-2013 - * - * Macros for THREADED_RTS support - * - * -------------------------------------------------------------------------- */ - -#ifndef RTS_STORAGE_SMPCLOSUREOPS_H -#define RTS_STORAGE_SMPCLOSUREOPS_H - -#ifdef CMINUSMINUS - -/* Lock closure, equivalent to ccall lockClosure but the condition is inlined. - * Arguments are swapped for uniformity with unlockClosure. */ -#if defined(THREADED_RTS) -#define LOCK_CLOSURE(closure, info) \ - if (CInt[n_capabilities] == 1 :: CInt) { \ - info = GET_INFO(closure); \ - } else { \ - ("ptr" info) = ccall reallyLockClosure(closure "ptr"); \ - } -#else -#define LOCK_CLOSURE(closure, info) info = GET_INFO(closure) -#endif - -#define unlockClosure(ptr,info) \ - prim_write_barrier; \ - StgHeader_info(ptr) = info; - -#else - -INLINE_HEADER StgInfoTable *lockClosure(StgClosure *p); -EXTERN_INLINE StgInfoTable *reallyLockClosure(StgClosure *p); -EXTERN_INLINE StgInfoTable *tryLockClosure(StgClosure *p); -EXTERN_INLINE void unlockClosure(StgClosure *p, const StgInfoTable *info); - -#if defined(THREADED_RTS) - -/* ----------------------------------------------------------------------------- - * Locking/unlocking closures - * - * This is used primarily in the implementation of MVars. - * -------------------------------------------------------------------------- */ - -// We want a callable copy of reallyLockClosure() so that we can refer to it -// from .cmm files compiled using the native codegen, so these are given -// EXTERN_INLINE. 
C-- should use LOCK_CLOSURE not lockClosure, so we've -// kept it INLINE_HEADER. -EXTERN_INLINE StgInfoTable *reallyLockClosure(StgClosure *p) -{ - StgWord info; - do { - uint32_t i = 0; - do { - info = xchg((P_)(void *)&p->header.info, (W_)&stg_WHITEHOLE_info); - if (info != (W_)&stg_WHITEHOLE_info) return (StgInfoTable *)info; - } while (++i < SPIN_COUNT); - yieldThread(); - } while (1); -} - -INLINE_HEADER StgInfoTable *lockClosure(StgClosure *p) -{ - if (n_capabilities == 1) { - return (StgInfoTable *)p->header.info; - } - else { - return reallyLockClosure(p); - } -} - -// ToDo: consider splitting tryLockClosure into reallyTryLockClosure, -// same as lockClosure -EXTERN_INLINE StgInfoTable *tryLockClosure(StgClosure *p) -{ - StgWord info; - if (n_capabilities == 1) { - return (StgInfoTable *)p->header.info; - } - else { - info = xchg((P_)(void *)&p->header.info, (W_)&stg_WHITEHOLE_info); - if (info != (W_)&stg_WHITEHOLE_info) { - return (StgInfoTable *)info; - } else { - return NULL; - } - } -} - -#else /* !THREADED_RTS */ - -EXTERN_INLINE StgInfoTable * -reallyLockClosure(StgClosure *p) -{ return (StgInfoTable *)p->header.info; } - -INLINE_HEADER StgInfoTable * -lockClosure(StgClosure *p) -{ return (StgInfoTable *)p->header.info; } - -EXTERN_INLINE StgInfoTable * -tryLockClosure(StgClosure *p) -{ return (StgInfoTable *)p->header.info; } - -#endif /* THREADED_RTS */ - -EXTERN_INLINE void unlockClosure(StgClosure *p, const StgInfoTable *info) -{ - // This is a strictly ordered write, so we need a write_barrier(): - write_barrier(); - p->header.info = info; -} - -// Handy specialised versions of lockClosure()/unlockClosure() -INLINE_HEADER void lockTSO(StgTSO *tso); -INLINE_HEADER void lockTSO(StgTSO *tso) -{ lockClosure((StgClosure *)tso); } - -INLINE_HEADER void unlockTSO(StgTSO *tso); -INLINE_HEADER void unlockTSO(StgTSO *tso) -{ unlockClosure((StgClosure*)tso, (const StgInfoTable *)&stg_TSO_info); } - -#endif /* CMINUSMINUS */ - -#endif /* 
RTS_STORAGE_SMPCLOSUREOPS_H */ |