diff options
author | Simon Marlow <marlowsd@gmail.com> | 2016-04-23 21:14:49 +0100 |
---|---|---|
committer | Simon Marlow <marlowsd@gmail.com> | 2016-06-10 21:25:54 +0100 |
commit | 9e5ea67e268be2659cd30ebaed7044d298198ab0 (patch) | |
tree | c395e74ee772ae0d59c852b3cbde743784b08d09 /rts/Capability.c | |
parent | b9fa72a24ba2cc3120912e6afedc9280d28d2077 (diff) | |
download | haskell-9e5ea67e268be2659cd30ebaed7044d298198ab0.tar.gz |
NUMA support
Summary:
The aim here is to reduce the number of remote memory accesses on
systems with a NUMA memory architecture, typically multi-socket servers.
Linux provides a NUMA API for doing two things:
* Allocating memory local to a particular node
* Binding a thread to a particular node
When given the +RTS --numa flag, the runtime will
* Determine the number of NUMA nodes (N) by querying the OS
* Assign capabilities to nodes, so cap C is on node C%N
* Bind worker threads on a capability to the correct node
* Keep separate free lists in the block layer for each node
* Allocate the nursery for a capability from node-local memory
* Allocate blocks in the GC from node-local memory
For example, using nofib/parallel/queens on a 24-core 2-socket machine:
```
$ ./Main 15 +RTS -N24 -s -A64m
Total time 173.960s ( 7.467s elapsed)
$ ./Main 15 +RTS -N24 -s -A64m --numa
Total time 150.836s ( 6.423s elapsed)
```
The biggest win here is expected to come from allocating from node-local
memory, so programs using a large -A value (as here) should benefit most.
According to perf, on this program the number of remote memory accesses
was reduced by more than 50% by using `--numa`.
Test Plan:
* validate
* There's a new flag --debug-numa=<n> that pretends to do NUMA without
actually making the OS calls, which is useful for testing the code
on non-NUMA systems.
* TODO: I need to add some unit tests
Reviewers: erikd, austin, rwbarton, ezyang, bgamari, hvr, niteria
Subscribers: thomie
Differential Revision: https://phabricator.haskell.org/D2199
Diffstat (limited to 'rts/Capability.c')
-rw-r--r-- | rts/Capability.c | 38 |
1 files changed, 23 insertions, 15 deletions
diff --git a/rts/Capability.c b/rts/Capability.c index 1b5f51a92b..411e64dc7a 100644 --- a/rts/Capability.c +++ b/rts/Capability.c @@ -51,7 +51,7 @@ Capability **capabilities = NULL; // an in-call has a chance of quickly finding a free Capability. // Maintaining a global free list of Capabilities would require global // locking, so we don't do that. -static Capability *last_free_capability = NULL; +static Capability *last_free_capability[MAX_NUMA_NODES]; /* * Indicates that the RTS wants to synchronise all the Capabilities @@ -230,11 +230,12 @@ popReturningTask (Capability *cap) * ------------------------------------------------------------------------- */ static void -initCapability( Capability *cap, uint32_t i ) +initCapability (Capability *cap, uint32_t i) { uint32_t g; cap->no = i; + cap->node = capNoToNumaNode(i); cap->in_haskell = rtsFalse; cap->idle = 0; cap->disabled = rtsFalse; @@ -316,9 +317,10 @@ initCapability( Capability *cap, uint32_t i ) * controlled by the user via the RTS flag -N. * * ------------------------------------------------------------------------- */ -void -initCapabilities( void ) +void initCapabilities (void) { + uint32_t i; + /* Declare a couple capability sets representing the process and clock domain. Each capability will get added to these capsets. 
*/ traceCapsetCreate(CAPSET_OSPROCESS_DEFAULT, CapsetTypeOsProcess); @@ -328,21 +330,22 @@ initCapabilities( void ) #ifndef REG_Base // We can't support multiple CPUs if BaseReg is not a register - if (RtsFlags.ParFlags.nNodes > 1) { + if (RtsFlags.ParFlags.nCapabilities > 1) { errorBelch("warning: multiple CPUs not supported in this build, reverting to 1"); - RtsFlags.ParFlags.nNodes = 1; + RtsFlags.ParFlags.nCapabilities = 1; } #endif n_capabilities = 0; - moreCapabilities(0, RtsFlags.ParFlags.nNodes); - n_capabilities = RtsFlags.ParFlags.nNodes; + moreCapabilities(0, RtsFlags.ParFlags.nCapabilities); + n_capabilities = RtsFlags.ParFlags.nCapabilities; #else /* !THREADED_RTS */ n_capabilities = 1; capabilities = stgMallocBytes(sizeof(Capability*), "initCapabilities"); capabilities[0] = &MainCapability; + initCapability(&MainCapability, 0); #endif @@ -352,7 +355,9 @@ initCapabilities( void ) // There are no free capabilities to begin with. We will start // a worker Task to each Capability, which will quickly put the // Capability on the free list when it finds nothing to do. 
- last_free_capability = capabilities[0]; + for (i = 0; i < RtsFlags.GcFlags.nNumaNodes; i++) { + last_free_capability[i] = capabilities[0]; + } } void @@ -532,7 +537,7 @@ releaseCapability_ (Capability* cap, #ifdef PROFILING cap->r.rCCCS = CCS_IDLE; #endif - last_free_capability = cap; + last_free_capability[cap->node] = cap; debugTrace(DEBUG_sched, "freeing capability %d", cap->no); } @@ -711,6 +716,7 @@ void waitForCapability (Capability **pCap, Task *task) *pCap = &MainCapability; #else + uint32_t i; Capability *cap = *pCap; if (cap == NULL) { @@ -719,12 +725,14 @@ void waitForCapability (Capability **pCap, Task *task) enabled_capabilities]; } else { // Try last_free_capability first - cap = last_free_capability; + cap = last_free_capability[task->node]; if (cap->running_task) { - uint32_t i; - // otherwise, search for a free capability + // Otherwise, search for a free capability on this node. cap = NULL; - for (i = 0; i < n_capabilities; i++) { + for (i = task->node; i < enabled_capabilities; + i += RtsFlags.GcFlags.nNumaNodes) { + // visits all the capabilities on this node, because + // cap[i]->node == i % RtsFlags.GcFlags.nNumaNodes if (!capabilities[i]->running_task) { cap = capabilities[i]; break; @@ -732,7 +740,7 @@ void waitForCapability (Capability **pCap, Task *task) } if (cap == NULL) { // Can't find a free one, use last_free_capability. - cap = last_free_capability; + cap = last_free_capability[task->node]; } } } |