summaryrefslogtreecommitdiff
path: root/rts/RtsFlags.c
diff options
context:
space:
mode:
authorSimon Marlow <marlowsd@gmail.com>2016-04-23 21:14:49 +0100
committerSimon Marlow <marlowsd@gmail.com>2016-06-10 21:25:54 +0100
commit9e5ea67e268be2659cd30ebaed7044d298198ab0 (patch)
treec395e74ee772ae0d59c852b3cbde743784b08d09 /rts/RtsFlags.c
parentb9fa72a24ba2cc3120912e6afedc9280d28d2077 (diff)
downloadhaskell-9e5ea67e268be2659cd30ebaed7044d298198ab0.tar.gz
NUMA support
Summary: The aim here is to reduce the number of remote memory accesses on systems with a NUMA memory architecture, typically multi-socket servers. Linux provides a NUMA API for doing two things: * Allocating memory local to a particular node * Binding a thread to a particular node When given the +RTS --numa flag, the runtime will * Determine the number of NUMA nodes (N) by querying the OS * Assign capabilities to nodes, so cap C is on node C%N * Bind worker threads on a capability to the correct node * Keep a separate free list in the block layer for each node * Allocate the nursery for a capability from node-local memory * Allocate blocks in the GC from node-local memory For example, using nofib/parallel/queens on a 24-core 2-socket machine: ``` $ ./Main 15 +RTS -N24 -s -A64m Total time 173.960s ( 7.467s elapsed) $ ./Main 15 +RTS -N24 -s -A64m --numa Total time 150.836s ( 6.423s elapsed) ``` The biggest win here is expected to be allocating from node-local memory, so that means programs using a large -A value (as here). According to perf, on this program the number of remote memory accesses was reduced by more than 50% by using `--numa`. Test Plan: * validate * There's a new flag --debug-numa=<n> that pretends to do NUMA without actually making the OS calls, which is useful for testing the code on non-NUMA systems. * TODO: I need to add some unit tests Reviewers: erikd, austin, rwbarton, ezyang, bgamari, hvr, niteria Subscribers: thomie Differential Revision: https://phabricator.haskell.org/D2199
Diffstat (limited to 'rts/RtsFlags.c')
-rw-r--r--rts/RtsFlags.c118
1 files changed, 101 insertions, 17 deletions
diff --git a/rts/RtsFlags.c b/rts/RtsFlags.c
index 1ec5db0bb0..25345bf57b 100644
--- a/rts/RtsFlags.c
+++ b/rts/RtsFlags.c
@@ -15,6 +15,7 @@
#include "RtsFlags.h"
#include "sm/OSMem.h"
#include "hooks/Hooks.h"
+#include "Capability.h"
#ifdef HAVE_CTYPE_H
#include <ctype.h>
@@ -122,6 +123,7 @@ static void errorRtsOptsDisabled (const char *s);
void initRtsFlagsDefaults(void)
{
+ uint32_t i;
StgWord64 maxStkSize = 8 * getPhysicalMemorySize() / 10;
// if getPhysicalMemorySize fails just move along with an 8MB limit
if (maxStkSize == 0)
@@ -157,8 +159,12 @@ void initRtsFlagsDefaults(void)
#endif
RtsFlags.GcFlags.heapBase = 0; /* means don't care */
RtsFlags.GcFlags.allocLimitGrace = (100*1024) / BLOCK_SIZE;
+ RtsFlags.GcFlags.numa = rtsFalse;
+ RtsFlags.GcFlags.nNumaNodes = 1;
+ for (i = 0; i < MAX_NUMA_NODES; i++) {
+ RtsFlags.GcFlags.numaMap[i] = 0;
+ }
-#ifdef DEBUG
RtsFlags.DebugFlags.scheduler = rtsFalse;
RtsFlags.DebugFlags.interpreter = rtsFalse;
RtsFlags.DebugFlags.weak = rtsFalse;
@@ -174,7 +180,7 @@ void initRtsFlagsDefaults(void)
RtsFlags.DebugFlags.squeeze = rtsFalse;
RtsFlags.DebugFlags.hpc = rtsFalse;
RtsFlags.DebugFlags.sparks = rtsFalse;
-#endif
+ RtsFlags.DebugFlags.numa = rtsFalse;
#if defined(PROFILING)
RtsFlags.CcFlags.doCostCentres = 0;
@@ -220,7 +226,7 @@ void initRtsFlagsDefaults(void)
RtsFlags.MiscFlags.linkerMemBase = 0;
#ifdef THREADED_RTS
- RtsFlags.ParFlags.nNodes = 1;
+ RtsFlags.ParFlags.nCapabilities = 1;
RtsFlags.ParFlags.migrate = rtsTrue;
RtsFlags.ParFlags.parGcEnabled = 1;
RtsFlags.ParFlags.parGcGen = 0;
@@ -398,6 +404,14 @@ usage_text[] = {
" -qi<n> If a processor has been idle for the last <n> GCs, do not",
" wake it up for a non-load-balancing parallel GC.",
" (0 disables, default: 0)",
+" --numa[=<node_mask>]",
+" Use NUMA, nodes given by <node_mask> (default: off)",
+#if defined(DEBUG)
+" --debug-numa[=<num_nodes>]",
+" Pretend NUMA: like --numa, but without the system calls.",
+" Can be used on non-NUMA systems for debugging.",
+"",
+#endif
#endif
" --install-signal-handlers=<yes|no>",
" Install signal handlers (default: yes)",
@@ -745,6 +759,76 @@ error = rtsTrue;
printRtsInfo();
stg_exit(0);
}
+#if defined(THREADED_RTS)
+ else if (!strncmp("numa", &rts_argv[arg][2], 4)) {
+ OPTION_SAFE;
+ StgWord mask;
+ if (rts_argv[arg][6] == '=') {
+ mask = (StgWord)strtol(rts_argv[arg]+7,
+ (char **) NULL, 10);
+ } else {
+ mask = (StgWord)~0;
+ }
+ if (!osNumaAvailable()) {
+ errorBelch("%s: OS reports NUMA is not available",
+ rts_argv[arg]);
+ error = rtsTrue;
+ break;
+ }
+
+ uint32_t nNodes = osNumaNodes();
+ if (nNodes > MAX_NUMA_NODES) {
+ errorBelch("%s: Too many NUMA nodes (max %d)",
+ rts_argv[arg], MAX_NUMA_NODES);
+ error = rtsTrue;
+ } else {
+ RtsFlags.GcFlags.numa = rtsTrue;
+ mask = mask & osNumaMask();
+ uint32_t logical = 0, physical = 0;
+ for (; physical < MAX_NUMA_NODES; physical++) {
+ if (mask & 1) {
+ RtsFlags.GcFlags.numaMap[logical++] = physical;
+ }
+ mask = mask >> 1;
+ }
+ RtsFlags.GcFlags.nNumaNodes = logical;
+ if (logical == 0) {
+ errorBelch("%s: available node set is empty",
+ rts_argv[arg]);
+ error = rtsTrue;
+ }
+ }
+ }
+#endif
+#if defined(DEBUG) && defined(THREADED_RTS)
+ else if (!strncmp("debug-numa", &rts_argv[arg][2], 10)) {
+ OPTION_SAFE;
+ size_t nNodes;
+ if (rts_argv[arg][12] == '=' &&
+ isdigit(rts_argv[arg][13])) {
+ nNodes = (StgWord)strtol(rts_argv[arg]+13,
+ (char **) NULL, 10);
+ } else {
+ errorBelch("%s: missing number of nodes",
+ rts_argv[arg]);
+ error = rtsTrue;
+ break;
+ }
+ if (nNodes > MAX_NUMA_NODES) {
+ errorBelch("%s: Too many NUMA nodes (max %d)",
+ rts_argv[arg], MAX_NUMA_NODES);
+ error = rtsTrue;
+ } else {
+ RtsFlags.GcFlags.numa = rtsTrue;
+ RtsFlags.DebugFlags.numa = rtsTrue;
+ RtsFlags.GcFlags.nNumaNodes = nNodes;
+ uint32_t physical = 0;
+ for (; physical < MAX_NUMA_NODES; physical++) {
+ RtsFlags.GcFlags.numaMap[physical] = physical;
+ }
+ }
+ }
+#endif
else {
OPTION_SAFE;
errorBelch("unknown RTS option: %s",rts_argv[arg]);
@@ -856,20 +940,20 @@ error = rtsTrue;
if (strncmp("maxN", &rts_argv[arg][1], 4) == 0) {
OPTION_SAFE;
THREADED_BUILD_ONLY(
- int nNodes;
+ int nCapabilities;
int proc = (int)getNumberOfProcessors();
- nNodes = strtol(rts_argv[arg]+5, (char **) NULL, 10);
- if (nNodes > proc) { nNodes = proc; }
+ nCapabilities = strtol(rts_argv[arg]+5, (char **) NULL, 10);
+ if (nCapabilities > proc) { nCapabilities = proc; }
- if (nNodes <= 0) {
+ if (nCapabilities <= 0) {
errorBelch("bad value for -maxN");
error = rtsTrue;
}
#if defined(PROFILING)
- RtsFlags.ParFlags.nNodes = 1;
+ RtsFlags.ParFlags.nCapabilities = 1;
#else
- RtsFlags.ParFlags.nNodes = (uint32_t)nNodes;
+ RtsFlags.ParFlags.nCapabilities = (uint32_t)nCapabilities;
#endif
) break;
} else {
@@ -1071,26 +1155,26 @@ error = rtsTrue;
THREADED_BUILD_ONLY(
if (rts_argv[arg][2] == '\0') {
#if defined(PROFILING)
- RtsFlags.ParFlags.nNodes = 1;
+ RtsFlags.ParFlags.nCapabilities = 1;
#else
- RtsFlags.ParFlags.nNodes = getNumberOfProcessors();
+ RtsFlags.ParFlags.nCapabilities = getNumberOfProcessors();
#endif
} else {
- int nNodes;
+ int nCapabilities;
OPTION_SAFE; /* but see extra checks below... */
- nNodes = strtol(rts_argv[arg]+2, (char **) NULL, 10);
+ nCapabilities = strtol(rts_argv[arg]+2, (char **) NULL, 10);
- if (nNodes <= 0) {
+ if (nCapabilities <= 0) {
errorBelch("bad value for -N");
error = rtsTrue;
}
if (rtsOptsEnabled == RtsOptsSafeOnly &&
- nNodes > (int)getNumberOfProcessors()) {
+ nCapabilities > (int)getNumberOfProcessors()) {
errorRtsOptsDisabled("Using large values for -N is not allowed by default. %s");
stg_exit(EXIT_FAILURE);
}
- RtsFlags.ParFlags.nNodes = (uint32_t)nNodes;
+ RtsFlags.ParFlags.nCapabilities = (uint32_t)nCapabilities;
}
) break;
@@ -1395,7 +1479,7 @@ static void normaliseRtsOpts (void)
}
#ifdef THREADED_RTS
- if (RtsFlags.ParFlags.parGcThreads > RtsFlags.ParFlags.nNodes) {
+ if (RtsFlags.ParFlags.parGcThreads > RtsFlags.ParFlags.nCapabilities) {
errorBelch("GC threads (-qn) must be between 1 and the value of -N");
errorUsage();
}