diff options
author | Simon Marlow <marlowsd@gmail.com> | 2016-04-23 21:14:49 +0100 |
---|---|---|
committer | Simon Marlow <marlowsd@gmail.com> | 2016-06-10 21:25:54 +0100 |
commit | 9e5ea67e268be2659cd30ebaed7044d298198ab0 (patch) | |
tree | c395e74ee772ae0d59c852b3cbde743784b08d09 /rts/SMPClosureOps.h | |
parent | b9fa72a24ba2cc3120912e6afedc9280d28d2077 (diff) | |
download | haskell-9e5ea67e268be2659cd30ebaed7044d298198ab0.tar.gz |
NUMA support
Summary:
The aim here is to reduce the number of remote memory accesses on
systems with a NUMA memory architecture, typically multi-socket servers.
Linux provides a NUMA API for doing two things:
* Allocating memory local to a particular node
* Binding a thread to a particular node
When given the +RTS --numa flag, the runtime will
* Determine the number of NUMA nodes (N) by querying the OS
* Assign capabilities to nodes, so cap C is on node C%N
* Bind worker threads on a capability to the correct node
* Keep a separate free lists in the block layer for each node
* Allocate the nursery for a capability from node-local memory
* Allocate blocks in the GC from node-local memory
For example, using nofib/parallel/queens on a 24-core 2-socket machine:
```
$ ./Main 15 +RTS -N24 -s -A64m
Total time 173.960s ( 7.467s elapsed)
$ ./Main 15 +RTS -N24 -s -A64m --numa
Total time 150.836s ( 6.423s elapsed)
```
The biggest win here is expected to be allocating from node-local
memory, so that means programs using a large -A value (as here).
According to perf, on this program the number of remote memory accesses
were reduced by more than 50% by using `--numa`.
Test Plan:
* validate
* There's a new flag --debug-numa=<n> that pretends to do NUMA without
actually making the OS calls, which is useful for testing the code
on non-NUMA systems.
* TODO: I need to add some unit tests
Reviewers: erikd, austin, rwbarton, ezyang, bgamari, hvr, niteria
Subscribers: thomie
Differential Revision: https://phabricator.haskell.org/D2199
Diffstat (limited to 'rts/SMPClosureOps.h')
-rw-r--r-- | rts/SMPClosureOps.h | 129 |
1 files changed, 129 insertions, 0 deletions
diff --git a/rts/SMPClosureOps.h b/rts/SMPClosureOps.h new file mode 100644 index 0000000000..39cde45544 --- /dev/null +++ b/rts/SMPClosureOps.h @@ -0,0 +1,129 @@ +/* ---------------------------------------------------------------------------- + * + * (c) The GHC Team, 2005-2013 + * + * Macros for THREADED_RTS support + * + * -------------------------------------------------------------------------- */ + +#ifndef RTS_STORAGE_SMPCLOSUREOPS_H +#define RTS_STORAGE_SMPCLOSUREOPS_H + +#include "BeginPrivate.h" + +#ifdef CMINUSMINUS + +/* Lock closure, equivalent to ccall lockClosure but the condition is inlined. + * Arguments are swapped for uniformity with unlockClosure. */ +#if defined(THREADED_RTS) +#define LOCK_CLOSURE(closure, info) \ + if (CInt[n_capabilities] == 1 :: CInt) { \ + info = GET_INFO(closure); \ + } else { \ + ("ptr" info) = ccall reallyLockClosure(closure "ptr"); \ + } +#else +#define LOCK_CLOSURE(closure, info) info = GET_INFO(closure) +#endif + +#define unlockClosure(ptr,info) \ + prim_write_barrier; \ + StgHeader_info(ptr) = info; + +#else + +INLINE_HEADER StgInfoTable *lockClosure(StgClosure *p); +EXTERN_INLINE StgInfoTable *reallyLockClosure(StgClosure *p); +EXTERN_INLINE StgInfoTable *tryLockClosure(StgClosure *p); +EXTERN_INLINE void unlockClosure(StgClosure *p, const StgInfoTable *info); + +#if defined(THREADED_RTS) + +/* ----------------------------------------------------------------------------- + * Locking/unlocking closures + * + * This is used primarily in the implementation of MVars. + * -------------------------------------------------------------------------- */ + +// We want a callable copy of reallyLockClosure() so that we can refer to it +// from .cmm files compiled using the native codegen, so these are given +// EXTERN_INLINE. C-- should use LOCK_CLOSURE not lockClosure, so we've +// kept it INLINE_HEADER. +EXTERN_INLINE StgInfoTable *reallyLockClosure(StgClosure *p) +{ + StgWord info; + do { + uint32_t i = 0; + do { + info = xchg((P_)(void *)&p->header.info, (W_)&stg_WHITEHOLE_info); + if (info != (W_)&stg_WHITEHOLE_info) return (StgInfoTable *)info; + } while (++i < SPIN_COUNT); + yieldThread(); + } while (1); +} + +INLINE_HEADER StgInfoTable *lockClosure(StgClosure *p) +{ + if (n_capabilities == 1) { + return (StgInfoTable *)p->header.info; + } + else { + return reallyLockClosure(p); + } +} + +// ToDo: consider splitting tryLockClosure into reallyTryLockClosure, +// same as lockClosure +EXTERN_INLINE StgInfoTable *tryLockClosure(StgClosure *p) +{ + StgWord info; + if (n_capabilities == 1) { + return (StgInfoTable *)p->header.info; + } + else { + info = xchg((P_)(void *)&p->header.info, (W_)&stg_WHITEHOLE_info); + if (info != (W_)&stg_WHITEHOLE_info) { + return (StgInfoTable *)info; + } else { + return NULL; + } + } +} + +#else /* !THREADED_RTS */ + +EXTERN_INLINE StgInfoTable * +reallyLockClosure(StgClosure *p) +{ return (StgInfoTable *)p->header.info; } + +INLINE_HEADER StgInfoTable * +lockClosure(StgClosure *p) +{ return (StgInfoTable *)p->header.info; } + +EXTERN_INLINE StgInfoTable * +tryLockClosure(StgClosure *p) +{ return (StgInfoTable *)p->header.info; } + +#endif /* THREADED_RTS */ + +EXTERN_INLINE void unlockClosure(StgClosure *p, const StgInfoTable *info) +{ + // This is a strictly ordered write, so we need a write_barrier(): + write_barrier(); + p->header.info = info; +} + +// Handy specialised versions of lockClosure()/unlockClosure() +INLINE_HEADER void lockTSO(StgTSO *tso); +INLINE_HEADER void lockTSO(StgTSO *tso) +{ lockClosure((StgClosure *)tso); } + +INLINE_HEADER void unlockTSO(StgTSO *tso); +INLINE_HEADER void unlockTSO(StgTSO *tso) +{ unlockClosure((StgClosure*)tso, (const StgInfoTable *)&stg_TSO_info); } + +#endif /* CMINUSMINUS */ + +#include "EndPrivate.h" + +#endif /* RTS_STORAGE_SMPCLOSUREOPS_H */ |