summary | refs | log | tree | commit | diff
path: root/rts/Capability.c
diff options
context:
space:
mode:
author Simon Marlow <marlowsd@gmail.com> 2016-04-23 21:14:49 +0100
committer Simon Marlow <marlowsd@gmail.com> 2016-06-10 21:25:54 +0100
commit 9e5ea67e268be2659cd30ebaed7044d298198ab0 (patch)
tree c395e74ee772ae0d59c852b3cbde743784b08d09 /rts/Capability.c
parent b9fa72a24ba2cc3120912e6afedc9280d28d2077 (diff)
download haskell-9e5ea67e268be2659cd30ebaed7044d298198ab0.tar.gz
NUMA support
Summary:
The aim here is to reduce the number of remote memory accesses on systems with a NUMA memory architecture, typically multi-socket servers.

Linux provides a NUMA API for doing two things:
* Allocating memory local to a particular node
* Binding a thread to a particular node

When given the +RTS --numa flag, the runtime will
* Determine the number of NUMA nodes (N) by querying the OS
* Assign capabilities to nodes, so cap C is on node C%N
* Bind worker threads on a capability to the correct node
* Keep separate free lists in the block layer for each node
* Allocate the nursery for a capability from node-local memory
* Allocate blocks in the GC from node-local memory

For example, using nofib/parallel/queens on a 24-core 2-socket machine:
```
$ ./Main 15 +RTS -N24 -s -A64m
  Total time 173.960s ( 7.467s elapsed)
$ ./Main 15 +RTS -N24 -s -A64m --numa
  Total time 150.836s ( 6.423s elapsed)
```
The biggest win here is expected to come from allocating from node-local memory, so the programs that benefit most are those using a large -A value (as here). According to perf, on this program the number of remote memory accesses was reduced by more than 50% by using `--numa`.

Test Plan:
* validate
* There's a new flag --debug-numa=<n> that pretends to do NUMA without actually making the OS calls, which is useful for testing the code on non-NUMA systems.
* TODO: I need to add some unit tests

Reviewers: erikd, austin, rwbarton, ezyang, bgamari, hvr, niteria
Subscribers: thomie
Differential Revision: https://phabricator.haskell.org/D2199
Diffstat (limited to 'rts/Capability.c')
-rw-r--r-- rts/Capability.c 38
1 files changed, 23 insertions, 15 deletions
diff --git a/rts/Capability.c b/rts/Capability.c
index 1b5f51a92b..411e64dc7a 100644
--- a/rts/Capability.c
+++ b/rts/Capability.c
@@ -51,7 +51,7 @@ Capability **capabilities = NULL;
// an in-call has a chance of quickly finding a free Capability.
// Maintaining a global free list of Capabilities would require global
// locking, so we don't do that.
-static Capability *last_free_capability = NULL;
+static Capability *last_free_capability[MAX_NUMA_NODES];
/*
* Indicates that the RTS wants to synchronise all the Capabilities
@@ -230,11 +230,12 @@ popReturningTask (Capability *cap)
* ------------------------------------------------------------------------- */
static void
-initCapability( Capability *cap, uint32_t i )
+initCapability (Capability *cap, uint32_t i)
{
uint32_t g;
cap->no = i;
+ cap->node = capNoToNumaNode(i);
cap->in_haskell = rtsFalse;
cap->idle = 0;
cap->disabled = rtsFalse;
@@ -316,9 +317,10 @@ initCapability( Capability *cap, uint32_t i )
* controlled by the user via the RTS flag -N.
*
* ------------------------------------------------------------------------- */
-void
-initCapabilities( void )
+void initCapabilities (void)
{
+ uint32_t i;
+
/* Declare a couple capability sets representing the process and
clock domain. Each capability will get added to these capsets. */
traceCapsetCreate(CAPSET_OSPROCESS_DEFAULT, CapsetTypeOsProcess);
@@ -328,21 +330,22 @@ initCapabilities( void )
#ifndef REG_Base
// We can't support multiple CPUs if BaseReg is not a register
- if (RtsFlags.ParFlags.nNodes > 1) {
+ if (RtsFlags.ParFlags.nCapabilities > 1) {
errorBelch("warning: multiple CPUs not supported in this build, reverting to 1");
- RtsFlags.ParFlags.nNodes = 1;
+ RtsFlags.ParFlags.nCapabilities = 1;
}
#endif
n_capabilities = 0;
- moreCapabilities(0, RtsFlags.ParFlags.nNodes);
- n_capabilities = RtsFlags.ParFlags.nNodes;
+ moreCapabilities(0, RtsFlags.ParFlags.nCapabilities);
+ n_capabilities = RtsFlags.ParFlags.nCapabilities;
#else /* !THREADED_RTS */
n_capabilities = 1;
capabilities = stgMallocBytes(sizeof(Capability*), "initCapabilities");
capabilities[0] = &MainCapability;
+
initCapability(&MainCapability, 0);
#endif
@@ -352,7 +355,9 @@ initCapabilities( void )
// There are no free capabilities to begin with. We will start
// a worker Task to each Capability, which will quickly put the
// Capability on the free list when it finds nothing to do.
- last_free_capability = capabilities[0];
+ for (i = 0; i < RtsFlags.GcFlags.nNumaNodes; i++) {
+ last_free_capability[i] = capabilities[0];
+ }
}
void
@@ -532,7 +537,7 @@ releaseCapability_ (Capability* cap,
#ifdef PROFILING
cap->r.rCCCS = CCS_IDLE;
#endif
- last_free_capability = cap;
+ last_free_capability[cap->node] = cap;
debugTrace(DEBUG_sched, "freeing capability %d", cap->no);
}
@@ -711,6 +716,7 @@ void waitForCapability (Capability **pCap, Task *task)
*pCap = &MainCapability;
#else
+ uint32_t i;
Capability *cap = *pCap;
if (cap == NULL) {
@@ -719,12 +725,14 @@ void waitForCapability (Capability **pCap, Task *task)
enabled_capabilities];
} else {
// Try last_free_capability first
- cap = last_free_capability;
+ cap = last_free_capability[task->node];
if (cap->running_task) {
- uint32_t i;
- // otherwise, search for a free capability
+ // Otherwise, search for a free capability on this node.
cap = NULL;
- for (i = 0; i < n_capabilities; i++) {
+ for (i = task->node; i < enabled_capabilities;
+ i += RtsFlags.GcFlags.nNumaNodes) {
+ // visits all the capabilities on this node, because
+ // cap[i]->node == i % RtsFlags.GcFlags.nNumaNodes
if (!capabilities[i]->running_task) {
cap = capabilities[i];
break;
@@ -732,7 +740,7 @@ void waitForCapability (Capability **pCap, Task *task)
}
if (cap == NULL) {
// Can't find a free one, use last_free_capability.
- cap = last_free_capability;
+ cap = last_free_capability[task->node];
}
}
}