summaryrefslogtreecommitdiff
path: root/rts/Task.c
diff options
context:
space:
mode:
authorSimon Marlow <marlowsd@gmail.com>2016-04-23 21:14:49 +0100
committerSimon Marlow <marlowsd@gmail.com>2016-06-10 21:25:54 +0100
commit9e5ea67e268be2659cd30ebaed7044d298198ab0 (patch)
treec395e74ee772ae0d59c852b3cbde743784b08d09 /rts/Task.c
parentb9fa72a24ba2cc3120912e6afedc9280d28d2077 (diff)
downloadhaskell-9e5ea67e268be2659cd30ebaed7044d298198ab0.tar.gz
NUMA support
Summary:
The aim here is to reduce the number of remote memory accesses on systems with a NUMA memory architecture, typically multi-socket servers.

Linux provides a NUMA API for doing two things:
* Allocating memory local to a particular node
* Binding a thread to a particular node

When given the +RTS --numa flag, the runtime will
* Determine the number of NUMA nodes (N) by querying the OS
* Assign capabilities to nodes, so cap C is on node C%N
* Bind worker threads on a capability to the correct node
* Keep separate free lists in the block layer for each node
* Allocate the nursery for a capability from node-local memory
* Allocate blocks in the GC from node-local memory

For example, using nofib/parallel/queens on a 24-core 2-socket machine:

```
$ ./Main 15 +RTS -N24 -s -A64m
  Total time 173.960s ( 7.467s elapsed)

$ ./Main 15 +RTS -N24 -s -A64m --numa
  Total time 150.836s ( 6.423s elapsed)
```

The biggest win here is expected to be allocating from node-local memory, so that means programs using a large -A value (as here).

According to perf, on this program the number of remote memory accesses was reduced by more than 50% by using `--numa`.

Test Plan:
* validate
* There's a new flag --debug-numa=<n> that pretends to do NUMA without actually making the OS calls, which is useful for testing the code on non-NUMA systems.
* TODO: I need to add some unit tests

Reviewers: erikd, austin, rwbarton, ezyang, bgamari, hvr, niteria

Subscribers: thomie

Differential Revision: https://phabricator.haskell.org/D2199
Diffstat (limited to 'rts/Task.c')
-rw-r--r--rts/Task.c26
1 files changed, 22 insertions, 4 deletions
diff --git a/rts/Task.c b/rts/Task.c
index 45ef77ba0b..9a827745ba 100644
--- a/rts/Task.c
+++ b/rts/Task.c
@@ -220,6 +220,7 @@ newTask (rtsBool worker)
initCondition(&task->cond);
initMutex(&task->lock);
task->wakeup = rtsFalse;
+ task->node = 0;
#endif
task->next = NULL;
@@ -427,6 +428,9 @@ workerStart(Task *task)
if (RtsFlags.ParFlags.setAffinity) {
setThreadAffinity(cap->no, n_capabilities);
}
+ if (RtsFlags.GcFlags.numa && !RtsFlags.DebugFlags.numa) {
+ setThreadNode(RtsFlags.GcFlags.numaMap[task->node]);
+ }
// set the thread-local pointer to the Task:
setMyTask(task);
@@ -457,6 +461,7 @@ startWorkerTask (Capability *cap)
// We don't emit a task creation event here, but in workerStart,
// where the kernel thread id is known.
task->cap = cap;
+ task->node = cap->node;
// Give the capability directly to the worker; we can't let anyone
// else get in, because the new worker Task has nowhere to go to
@@ -490,13 +495,27 @@ interruptWorkerTask (Task *task)
#endif /* THREADED_RTS */
-void
-setInCallCapability (int preferred_capability)
+void rts_setInCallCapability (
+ int preferred_capability,
+ int affinity USED_IF_THREADS)
{
Task *task = allocTask();
task->preferred_capability = preferred_capability;
-}
+#ifdef THREADED_RTS
+ if (affinity) {
+ if (RtsFlags.ParFlags.setAffinity) {
+ setThreadAffinity(preferred_capability, n_capabilities);
+ }
+ if (RtsFlags.GcFlags.numa) {
+ task->node = capNoToNumaNode(preferred_capability);
+ if (!DEBUG_IS_ON || !RtsFlags.DebugFlags.numa) { // faking NUMA
+ setThreadNode(RtsFlags.GcFlags.numaMap[task->node]);
+ }
+ }
+ }
+#endif
+}
#ifdef DEBUG
@@ -525,4 +544,3 @@ printAllTasks(void)
}
#endif
-