summary | refs | log | tree | commit | diff
path: root/includes
diff options
context:
space:
mode:
author: Simon Marlow <marlowsd@gmail.com> 2016-04-23 21:14:49 +0100
committer: Simon Marlow <marlowsd@gmail.com> 2016-06-10 21:25:54 +0100
commit: 9e5ea67e268be2659cd30ebaed7044d298198ab0 (patch)
tree: c395e74ee772ae0d59c852b3cbde743784b08d09 /includes
parent: b9fa72a24ba2cc3120912e6afedc9280d28d2077 (diff)
download: haskell-9e5ea67e268be2659cd30ebaed7044d298198ab0.tar.gz
NUMA support
Summary: The aim here is to reduce the number of remote memory accesses on systems with a NUMA memory architecture, typically multi-socket servers. Linux provides a NUMA API for doing two things: * Allocating memory local to a particular node * Binding a thread to a particular node When given the +RTS --numa flag, the runtime will * Determine the number of NUMA nodes (N) by querying the OS * Assign capabilities to nodes, so cap C is on node C%N * Bind worker threads on a capability to the correct node * Keep separate free lists in the block layer for each node * Allocate the nursery for a capability from node-local memory * Allocate blocks in the GC from node-local memory For example, using nofib/parallel/queens on a 24-core 2-socket machine: ``` $ ./Main 15 +RTS -N24 -s -A64m Total time 173.960s ( 7.467s elapsed) $ ./Main 15 +RTS -N24 -s -A64m --numa Total time 150.836s ( 6.423s elapsed) ``` The biggest win here is expected to come from allocating from node-local memory, so the programs that benefit most are those using a large -A value (as here). According to perf, on this program the number of remote memory accesses was reduced by more than 50% by using `--numa`. Test Plan: * validate * There's a new flag --debug-numa=<n> that pretends to do NUMA without actually making the OS calls, which is useful for testing the code on non-NUMA systems. * TODO: I need to add some unit tests Reviewers: erikd, austin, rwbarton, ezyang, bgamari, hvr, niteria Subscribers: thomie Differential Revision: https://phabricator.haskell.org/D2199
Diffstat (limited to 'includes')
-rw-r--r-- includes/Cmm.h 1
-rw-r--r-- includes/Rts.h 1
-rw-r--r-- includes/RtsAPI.h 6
-rw-r--r-- includes/rts/Constants.h 6
-rw-r--r-- includes/rts/Flags.h 8
-rw-r--r-- includes/rts/OSThreads.h 4
-rw-r--r-- includes/rts/Threads.h 4
-rw-r--r-- includes/rts/storage/Block.h 20
-rw-r--r-- includes/rts/storage/MBlock.h 2
-rw-r--r-- includes/rts/storage/SMPClosureOps.h 125
10 files changed, 44 insertions, 133 deletions
diff --git a/includes/Cmm.h b/includes/Cmm.h
index cbd7e36ac1..3b9a5a6794 100644
--- a/includes/Cmm.h
+++ b/includes/Cmm.h
@@ -325,7 +325,6 @@
#include "DerivedConstants.h"
#include "rts/storage/ClosureTypes.h"
#include "rts/storage/FunTypes.h"
-#include "rts/storage/SMPClosureOps.h"
#include "rts/OSThreads.h"
/*
diff --git a/includes/Rts.h b/includes/Rts.h
index 1ad1bba5f8..3d4538f41d 100644
--- a/includes/Rts.h
+++ b/includes/Rts.h
@@ -203,7 +203,6 @@ INLINE_HEADER Time fsecondsToTime (double t)
#include "rts/storage/ClosureTypes.h"
#include "rts/storage/TSO.h"
#include "stg/MiscClosures.h" /* InfoTables, closures etc. defined in the RTS */
-#include "rts/storage/SMPClosureOps.h"
#include "rts/storage/Block.h"
#include "rts/storage/ClosureMacros.h"
#include "rts/storage/MBlock.h"
diff --git a/includes/RtsAPI.h b/includes/RtsAPI.h
index a4a094fd14..230c982c12 100644
--- a/includes/RtsAPI.h
+++ b/includes/RtsAPI.h
@@ -179,7 +179,11 @@ Capability *rts_unsafeGetMyCapability (void);
// Note that the thread may still be migrated by the RTS scheduler, but that
// will only happen if there are multiple threads running on one Capability and
// another Capability is free.
-void setInCallCapability (int preferred_capability);
+//
+// If affinity is non-zero, the current thread will be bound to
+// specific CPUs according to the prevailing affinity policy for the
+// specified capability, set by either +RTS -qa or +RTS --numa.
+void rts_setInCallCapability (int preferred_capability, int affinity);
/* ----------------------------------------------------------------------------
Building Haskell objects from C datatypes.
diff --git a/includes/rts/Constants.h b/includes/rts/Constants.h
index b65b8d3a9d..114f30ce2f 100644
--- a/includes/rts/Constants.h
+++ b/includes/rts/Constants.h
@@ -295,4 +295,10 @@
#define MAX_SPARE_WORKERS 6
+/*
+ * The maximum number of NUMA nodes we support. This is a fixed limit so that
+ * we can have static arrays of this size in the RTS for speed.
+ */
+#define MAX_NUMA_NODES 16
+
#endif /* RTS_CONSTANTS_H */
diff --git a/includes/rts/Flags.h b/includes/rts/Flags.h
index 8020a177b0..ff303dc5e6 100644
--- a/includes/rts/Flags.h
+++ b/includes/rts/Flags.h
@@ -73,6 +73,11 @@ typedef struct _GC_FLAGS {
* to handle the exception before we
* raise it again.
*/
+
+ rtsBool numa; /* Use NUMA */
+ uint32_t nNumaNodes; /* Number of nodes */
+ uint32_t numaMap[MAX_NUMA_NODES]; /* Map our internal node numbers to OS
+ * node numbers */
} GC_FLAGS;
/* See Note [Synchronization of flags and base APIs] */
@@ -93,6 +98,7 @@ typedef struct _DEBUG_FLAGS {
rtsBool squeeze; /* 'z' stack squeezing & lazy blackholing */
rtsBool hpc; /* 'c' coverage */
rtsBool sparks; /* 'r' */
+ rtsBool numa; /* '--debug-numa' */
} DEBUG_FLAGS;
/* See Note [Synchronization of flags and base APIs] */
@@ -184,7 +190,7 @@ typedef struct _MISC_FLAGS {
#ifdef THREADED_RTS
/* See Note [Synchronization of flags and base APIs] */
typedef struct _PAR_FLAGS {
- uint32_t nNodes; /* number of threads to run simultaneously */
+ uint32_t nCapabilities; /* number of threads to run simultaneously */
rtsBool migrate; /* migrate threads between capabilities */
uint32_t maxLocalSparks;
rtsBool parGcEnabled; /* enable parallel GC */
diff --git a/includes/rts/OSThreads.h b/includes/rts/OSThreads.h
index ee1855b4da..bc84b714db 100644
--- a/includes/rts/OSThreads.h
+++ b/includes/rts/OSThreads.h
@@ -200,7 +200,9 @@ void setThreadLocalVar (ThreadLocalKey *key, void *value);
void freeThreadLocalKey (ThreadLocalKey *key);
// Processors and affinity
-void setThreadAffinity (uint32_t n, uint32_t m);
+void setThreadAffinity (uint32_t n, uint32_t m);
+void setThreadNode (uint32_t node);
+void releaseThreadNode (void);
#endif // !CMINUSMINUS
#else
diff --git a/includes/rts/Threads.h b/includes/rts/Threads.h
index 67d01db7d4..866c4692bd 100644
--- a/includes/rts/Threads.h
+++ b/includes/rts/Threads.h
@@ -58,7 +58,9 @@ pid_t forkProcess (HsStablePtr *entry)
HsBool rtsSupportsBoundThreads (void);
-// The number of Capabilities
+// The number of Capabilities.
+// ToDo: I would like this to be private to the RTS and instead expose a
+// function getNumCapabilities(), but it is used in compiler/cbits/genSym.c
extern unsigned int n_capabilities;
// The number of Capabilities that are not disabled
diff --git a/includes/rts/storage/Block.h b/includes/rts/storage/Block.h
index 1a31de5512..e04cfdfec6 100644
--- a/includes/rts/storage/Block.h
+++ b/includes/rts/storage/Block.h
@@ -111,7 +111,7 @@ typedef struct bdescr_ {
StgWord16 gen_no; // gen->no, cached
StgWord16 dest_no; // number of destination generation
- StgWord16 _pad1;
+ StgWord16 node; // which memory node does this block live on?
StgWord16 flags; // block flags, see below
@@ -280,12 +280,28 @@ extern void initBlockAllocator(void);
/* Allocation -------------------------------------------------------------- */
bdescr *allocGroup(W_ n);
-bdescr *allocBlock(void);
+
+EXTERN_INLINE bdescr* allocBlock(void);
+EXTERN_INLINE bdescr* allocBlock(void)
+{
+ return allocGroup(1);
+}
+
+bdescr *allocGroupOnNode(uint32_t node, W_ n);
+
+EXTERN_INLINE bdescr* allocBlockOnNode(uint32_t node);
+EXTERN_INLINE bdescr* allocBlockOnNode(uint32_t node)
+{
+ return allocGroupOnNode(node,1);
+}
// versions that take the storage manager lock for you:
bdescr *allocGroup_lock(W_ n);
bdescr *allocBlock_lock(void);
+bdescr *allocGroupOnNode_lock(uint32_t node, W_ n);
+bdescr *allocBlockOnNode_lock(uint32_t node);
+
/* De-Allocation ----------------------------------------------------------- */
void freeGroup(bdescr *p);
diff --git a/includes/rts/storage/MBlock.h b/includes/rts/storage/MBlock.h
index 419a96e225..a8251c8c0b 100644
--- a/includes/rts/storage/MBlock.h
+++ b/includes/rts/storage/MBlock.h
@@ -18,6 +18,8 @@ extern W_ mblocks_allocated;
extern void initMBlocks(void);
extern void * getMBlock(void);
extern void * getMBlocks(uint32_t n);
+extern void * getMBlockOnNode(uint32_t node);
+extern void * getMBlocksOnNode(uint32_t node, uint32_t n);
extern void freeMBlocks(void *addr, uint32_t n);
extern void releaseFreeMemory(void);
extern void freeAllMBlocks(void);
diff --git a/includes/rts/storage/SMPClosureOps.h b/includes/rts/storage/SMPClosureOps.h
deleted file mode 100644
index ee92186f5f..0000000000
--- a/includes/rts/storage/SMPClosureOps.h
+++ /dev/null
@@ -1,125 +0,0 @@
-/* ----------------------------------------------------------------------------
- *
- * (c) The GHC Team, 2005-2013
- *
- * Macros for THREADED_RTS support
- *
- * -------------------------------------------------------------------------- */
-
-#ifndef RTS_STORAGE_SMPCLOSUREOPS_H
-#define RTS_STORAGE_SMPCLOSUREOPS_H
-
-#ifdef CMINUSMINUS
-
-/* Lock closure, equivalent to ccall lockClosure but the condition is inlined.
- * Arguments are swapped for uniformity with unlockClosure. */
-#if defined(THREADED_RTS)
-#define LOCK_CLOSURE(closure, info) \
- if (CInt[n_capabilities] == 1 :: CInt) { \
- info = GET_INFO(closure); \
- } else { \
- ("ptr" info) = ccall reallyLockClosure(closure "ptr"); \
- }
-#else
-#define LOCK_CLOSURE(closure, info) info = GET_INFO(closure)
-#endif
-
-#define unlockClosure(ptr,info) \
- prim_write_barrier; \
- StgHeader_info(ptr) = info;
-
-#else
-
-INLINE_HEADER StgInfoTable *lockClosure(StgClosure *p);
-EXTERN_INLINE StgInfoTable *reallyLockClosure(StgClosure *p);
-EXTERN_INLINE StgInfoTable *tryLockClosure(StgClosure *p);
-EXTERN_INLINE void unlockClosure(StgClosure *p, const StgInfoTable *info);
-
-#if defined(THREADED_RTS)
-
-/* -----------------------------------------------------------------------------
- * Locking/unlocking closures
- *
- * This is used primarily in the implementation of MVars.
- * -------------------------------------------------------------------------- */
-
-// We want a callable copy of reallyLockClosure() so that we can refer to it
-// from .cmm files compiled using the native codegen, so these are given
-// EXTERN_INLINE. C-- should use LOCK_CLOSURE not lockClosure, so we've
-// kept it INLINE_HEADER.
-EXTERN_INLINE StgInfoTable *reallyLockClosure(StgClosure *p)
-{
- StgWord info;
- do {
- uint32_t i = 0;
- do {
- info = xchg((P_)(void *)&p->header.info, (W_)&stg_WHITEHOLE_info);
- if (info != (W_)&stg_WHITEHOLE_info) return (StgInfoTable *)info;
- } while (++i < SPIN_COUNT);
- yieldThread();
- } while (1);
-}
-
-INLINE_HEADER StgInfoTable *lockClosure(StgClosure *p)
-{
- if (n_capabilities == 1) {
- return (StgInfoTable *)p->header.info;
- }
- else {
- return reallyLockClosure(p);
- }
-}
-
-// ToDo: consider splitting tryLockClosure into reallyTryLockClosure,
-// same as lockClosure
-EXTERN_INLINE StgInfoTable *tryLockClosure(StgClosure *p)
-{
- StgWord info;
- if (n_capabilities == 1) {
- return (StgInfoTable *)p->header.info;
- }
- else {
- info = xchg((P_)(void *)&p->header.info, (W_)&stg_WHITEHOLE_info);
- if (info != (W_)&stg_WHITEHOLE_info) {
- return (StgInfoTable *)info;
- } else {
- return NULL;
- }
- }
-}
-
-#else /* !THREADED_RTS */
-
-EXTERN_INLINE StgInfoTable *
-reallyLockClosure(StgClosure *p)
-{ return (StgInfoTable *)p->header.info; }
-
-INLINE_HEADER StgInfoTable *
-lockClosure(StgClosure *p)
-{ return (StgInfoTable *)p->header.info; }
-
-EXTERN_INLINE StgInfoTable *
-tryLockClosure(StgClosure *p)
-{ return (StgInfoTable *)p->header.info; }
-
-#endif /* THREADED_RTS */
-
-EXTERN_INLINE void unlockClosure(StgClosure *p, const StgInfoTable *info)
-{
- // This is a strictly ordered write, so we need a write_barrier():
- write_barrier();
- p->header.info = info;
-}
-
-// Handy specialised versions of lockClosure()/unlockClosure()
-INLINE_HEADER void lockTSO(StgTSO *tso);
-INLINE_HEADER void lockTSO(StgTSO *tso)
-{ lockClosure((StgClosure *)tso); }
-
-INLINE_HEADER void unlockTSO(StgTSO *tso);
-INLINE_HEADER void unlockTSO(StgTSO *tso)
-{ unlockClosure((StgClosure*)tso, (const StgInfoTable *)&stg_TSO_info); }
-
-#endif /* CMINUSMINUS */
-
-#endif /* RTS_STORAGE_SMPCLOSUREOPS_H */