diff options
author | Simon Marlow <marlowsd@gmail.com> | 2016-04-23 21:14:49 +0100 |
---|---|---|
committer | Simon Marlow <marlowsd@gmail.com> | 2016-06-10 21:25:54 +0100 |
commit | 9e5ea67e268be2659cd30ebaed7044d298198ab0 (patch) | |
tree | c395e74ee772ae0d59c852b3cbde743784b08d09 /rts/RtsFlags.c | |
parent | b9fa72a24ba2cc3120912e6afedc9280d28d2077 (diff) | |
download | haskell-9e5ea67e268be2659cd30ebaed7044d298198ab0.tar.gz |
NUMA support
Summary:
The aim here is to reduce the number of remote memory accesses on
systems with a NUMA memory architecture, typically multi-socket servers.
Linux provides a NUMA API for doing two things:
* Allocating memory local to a particular node
* Binding a thread to a particular node
When given the +RTS --numa flag, the runtime will
* Determine the number of NUMA nodes (N) by querying the OS
* Assign capabilities to nodes, so cap C is on node C%N
* Bind worker threads on a capability to the correct node
* Keep separate free lists in the block layer for each node
* Allocate the nursery for a capability from node-local memory
* Allocate blocks in the GC from node-local memory
For example, using nofib/parallel/queens on a 24-core 2-socket machine:
```
$ ./Main 15 +RTS -N24 -s -A64m
Total time 173.960s ( 7.467s elapsed)
$ ./Main 15 +RTS -N24 -s -A64m --numa
Total time 150.836s ( 6.423s elapsed)
```
The biggest win here is expected to come from allocating node-local
memory, so programs using a large -A value (as here) should benefit most.
According to perf, on this program the number of remote memory accesses
was reduced by more than 50% by using `--numa`.
Test Plan:
* validate
* There's a new flag --debug-numa=<n> that pretends to do NUMA without
actually making the OS calls, which is useful for testing the code
on non-NUMA systems.
* TODO: I need to add some unit tests
Reviewers: erikd, austin, rwbarton, ezyang, bgamari, hvr, niteria
Subscribers: thomie
Differential Revision: https://phabricator.haskell.org/D2199
Diffstat (limited to 'rts/RtsFlags.c')
-rw-r--r-- | rts/RtsFlags.c | 118 |
1 files changed, 101 insertions, 17 deletions
diff --git a/rts/RtsFlags.c b/rts/RtsFlags.c index 1ec5db0bb0..25345bf57b 100644 --- a/rts/RtsFlags.c +++ b/rts/RtsFlags.c @@ -15,6 +15,7 @@ #include "RtsFlags.h" #include "sm/OSMem.h" #include "hooks/Hooks.h" +#include "Capability.h" #ifdef HAVE_CTYPE_H #include <ctype.h> @@ -122,6 +123,7 @@ static void errorRtsOptsDisabled (const char *s); void initRtsFlagsDefaults(void) { + uint32_t i; StgWord64 maxStkSize = 8 * getPhysicalMemorySize() / 10; // if getPhysicalMemorySize fails just move along with an 8MB limit if (maxStkSize == 0) @@ -157,8 +159,12 @@ void initRtsFlagsDefaults(void) #endif RtsFlags.GcFlags.heapBase = 0; /* means don't care */ RtsFlags.GcFlags.allocLimitGrace = (100*1024) / BLOCK_SIZE; + RtsFlags.GcFlags.numa = rtsFalse; + RtsFlags.GcFlags.nNumaNodes = 1; + for (i = 0; i < MAX_NUMA_NODES; i++) { + RtsFlags.GcFlags.numaMap[i] = 0; + } -#ifdef DEBUG RtsFlags.DebugFlags.scheduler = rtsFalse; RtsFlags.DebugFlags.interpreter = rtsFalse; RtsFlags.DebugFlags.weak = rtsFalse; @@ -174,7 +180,7 @@ void initRtsFlagsDefaults(void) RtsFlags.DebugFlags.squeeze = rtsFalse; RtsFlags.DebugFlags.hpc = rtsFalse; RtsFlags.DebugFlags.sparks = rtsFalse; -#endif + RtsFlags.DebugFlags.numa = rtsFalse; #if defined(PROFILING) RtsFlags.CcFlags.doCostCentres = 0; @@ -220,7 +226,7 @@ void initRtsFlagsDefaults(void) RtsFlags.MiscFlags.linkerMemBase = 0; #ifdef THREADED_RTS - RtsFlags.ParFlags.nNodes = 1; + RtsFlags.ParFlags.nCapabilities = 1; RtsFlags.ParFlags.migrate = rtsTrue; RtsFlags.ParFlags.parGcEnabled = 1; RtsFlags.ParFlags.parGcGen = 0; @@ -398,6 +404,14 @@ usage_text[] = { " -qi<n> If a processor has been idle for the last <n> GCs, do not", " wake it up for a non-load-balancing parallel GC.", " (0 disables, default: 0)", +" --numa[=<node_mask>]", +" Use NUMA, nodes given by <node_mask> (default: off)", +#if defined(DEBUG) +" --debug-numa[=<num_nodes>]", +" Pretend NUMA: like --numa, but without the system calls.", +" Can be used on non-NUMA systems for debugging.", 
+"", +#endif #endif " --install-signal-handlers=<yes|no>", " Install signal handlers (default: yes)", @@ -745,6 +759,76 @@ error = rtsTrue; printRtsInfo(); stg_exit(0); } +#if defined(THREADED_RTS) + else if (!strncmp("numa", &rts_argv[arg][2], 4)) { + OPTION_SAFE; + StgWord mask; + if (rts_argv[arg][6] == '=') { + mask = (StgWord)strtol(rts_argv[arg]+7, + (char **) NULL, 10); + } else { + mask = (StgWord)~0; + } + if (!osNumaAvailable()) { + errorBelch("%s: OS reports NUMA is not available", + rts_argv[arg]); + error = rtsTrue; + break; + } + + uint32_t nNodes = osNumaNodes(); + if (nNodes > MAX_NUMA_NODES) { + errorBelch("%s: Too many NUMA nodes (max %d)", + rts_argv[arg], MAX_NUMA_NODES); + error = rtsTrue; + } else { + RtsFlags.GcFlags.numa = rtsTrue; + mask = mask & osNumaMask(); + uint32_t logical = 0, physical = 0; + for (; physical < MAX_NUMA_NODES; physical++) { + if (mask & 1) { + RtsFlags.GcFlags.numaMap[logical++] = physical; + } + mask = mask >> 1; + } + RtsFlags.GcFlags.nNumaNodes = logical; + if (logical == 0) { + errorBelch("%s: available node set is empty", + rts_argv[arg]); + error = rtsTrue; + } + } + } +#endif +#if defined(DEBUG) && defined(THREADED_RTS) + else if (!strncmp("debug-numa", &rts_argv[arg][2], 10)) { + OPTION_SAFE; + size_t nNodes; + if (rts_argv[arg][12] == '=' && + isdigit(rts_argv[arg][13])) { + nNodes = (StgWord)strtol(rts_argv[arg]+13, + (char **) NULL, 10); + } else { + errorBelch("%s: missing number of nodes", + rts_argv[arg]); + error = rtsTrue; + break; + } + if (nNodes > MAX_NUMA_NODES) { + errorBelch("%s: Too many NUMA nodes (max %d)", + rts_argv[arg], MAX_NUMA_NODES); + error = rtsTrue; + } else { + RtsFlags.GcFlags.numa = rtsTrue; + RtsFlags.DebugFlags.numa = rtsTrue; + RtsFlags.GcFlags.nNumaNodes = nNodes; + uint32_t physical = 0; + for (; physical < MAX_NUMA_NODES; physical++) { + RtsFlags.GcFlags.numaMap[physical] = physical; + } + } + } +#endif else { OPTION_SAFE; errorBelch("unknown RTS option: %s",rts_argv[arg]); 
@@ -856,20 +940,20 @@ error = rtsTrue; if (strncmp("maxN", &rts_argv[arg][1], 4) == 0) { OPTION_SAFE; THREADED_BUILD_ONLY( - int nNodes; + int nCapabilities; int proc = (int)getNumberOfProcessors(); - nNodes = strtol(rts_argv[arg]+5, (char **) NULL, 10); - if (nNodes > proc) { nNodes = proc; } + nCapabilities = strtol(rts_argv[arg]+5, (char **) NULL, 10); + if (nCapabilities > proc) { nCapabilities = proc; } - if (nNodes <= 0) { + if (nCapabilities <= 0) { errorBelch("bad value for -maxN"); error = rtsTrue; } #if defined(PROFILING) - RtsFlags.ParFlags.nNodes = 1; + RtsFlags.ParFlags.nCapabilities = 1; #else - RtsFlags.ParFlags.nNodes = (uint32_t)nNodes; + RtsFlags.ParFlags.nCapabilities = (uint32_t)nCapabilities; #endif ) break; } else { @@ -1071,26 +1155,26 @@ error = rtsTrue; THREADED_BUILD_ONLY( if (rts_argv[arg][2] == '\0') { #if defined(PROFILING) - RtsFlags.ParFlags.nNodes = 1; + RtsFlags.ParFlags.nCapabilities = 1; #else - RtsFlags.ParFlags.nNodes = getNumberOfProcessors(); + RtsFlags.ParFlags.nCapabilities = getNumberOfProcessors(); #endif } else { - int nNodes; + int nCapabilities; OPTION_SAFE; /* but see extra checks below... */ - nNodes = strtol(rts_argv[arg]+2, (char **) NULL, 10); + nCapabilities = strtol(rts_argv[arg]+2, (char **) NULL, 10); - if (nNodes <= 0) { + if (nCapabilities <= 0) { errorBelch("bad value for -N"); error = rtsTrue; } if (rtsOptsEnabled == RtsOptsSafeOnly && - nNodes > (int)getNumberOfProcessors()) { + nCapabilities > (int)getNumberOfProcessors()) { errorRtsOptsDisabled("Using large values for -N is not allowed by default. 
%s"); stg_exit(EXIT_FAILURE); } - RtsFlags.ParFlags.nNodes = (uint32_t)nNodes; + RtsFlags.ParFlags.nCapabilities = (uint32_t)nCapabilities; } ) break; @@ -1395,7 +1479,7 @@ static void normaliseRtsOpts (void) } #ifdef THREADED_RTS - if (RtsFlags.ParFlags.parGcThreads > RtsFlags.ParFlags.nNodes) { + if (RtsFlags.ParFlags.parGcThreads > RtsFlags.ParFlags.nCapabilities) { errorBelch("GC threads (-qn) must be between 1 and the value of -N"); errorUsage(); } |