path: root/rts/SMPClosureOps.h
author    Simon Marlow <marlowsd@gmail.com>    2016-04-23 21:14:49 +0100
committer Simon Marlow <marlowsd@gmail.com>    2016-06-10 21:25:54 +0100
commit    9e5ea67e268be2659cd30ebaed7044d298198ab0 (patch)
tree      c395e74ee772ae0d59c852b3cbde743784b08d09 /rts/SMPClosureOps.h
parent    b9fa72a24ba2cc3120912e6afedc9280d28d2077 (diff)
download  haskell-9e5ea67e268be2659cd30ebaed7044d298198ab0.tar.gz
NUMA support
Summary:

The aim here is to reduce the number of remote memory accesses on systems
with a NUMA memory architecture, typically multi-socket servers.

Linux provides a NUMA API for doing two things:

* Allocating memory local to a particular node
* Binding a thread to a particular node

When given the +RTS --numa flag, the runtime will:

* Determine the number of NUMA nodes (N) by querying the OS
* Assign capabilities to nodes, so cap C is on node C%N
* Bind worker threads on a capability to the correct node
* Keep a separate free list in the block layer for each node
* Allocate the nursery for a capability from node-local memory
* Allocate blocks in the GC from node-local memory

For example, using nofib/parallel/queens on a 24-core 2-socket machine:

```
$ ./Main 15 +RTS -N24 -s -A64m
  Total time  173.960s  (  7.467s elapsed)

$ ./Main 15 +RTS -N24 -s -A64m --numa
  Total time  150.836s  (  6.423s elapsed)
```

The biggest win here is expected to come from allocating from node-local
memory, so programs using a large -A value (as here) benefit most.
According to perf, the number of remote memory accesses on this program
was reduced by more than 50% by using `--numa`.

Test Plan:

* validate
* There's a new flag --debug-numa=<n> that pretends to do NUMA without
  actually making the OS calls, which is useful for testing the code on
  non-NUMA systems.
* TODO: I need to add some unit tests

Reviewers: erikd, austin, rwbarton, ezyang, bgamari, hvr, niteria

Subscribers: thomie

Differential Revision: https://phabricator.haskell.org/D2199
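As a rough illustration of the scheme the summary describes, here is a minimal, hypothetical sketch using Linux's libnuma library. This is not the RTS code: the capability number, the 64MB "nursery" size, and the helper names are invented for the example. It queries the node count, maps capability C to node C % N, binds the calling thread to that node, and allocates node-local memory. Build with -lnuma.

```
/* Hypothetical sketch of the --numa scheme, not the RTS implementation. */
#include <numa.h>
#include <stdio.h>

/* cap C is on node C % N, as in the commit message */
static int cap_to_node(int cap, int n_nodes)
{
    return cap % n_nodes;
}

int main(void)
{
    if (numa_available() == -1) {          /* no NUMA support on this system */
        fprintf(stderr, "NUMA not available\n");
        return 1;
    }

    int n_nodes = numa_max_node() + 1;     /* N, queried from the OS */
    int cap     = 5;                       /* example capability number */
    int node    = cap_to_node(cap, n_nodes);

    numa_run_on_node(node);                /* bind this thread to the node */

    /* allocate a 64MB "nursery" from node-local memory */
    size_t sz = 64 * 1024 * 1024;
    void *nursery = numa_alloc_onnode(sz, node);
    if (nursery == NULL) {
        return 1;
    }

    printf("capability %d -> node %d of %d\n", cap, node, n_nodes);
    numa_free(nursery, sz);
    return 0;
}
```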
Diffstat (limited to 'rts/SMPClosureOps.h')
-rw-r--r--   rts/SMPClosureOps.h   129
1 file changed, 129 insertions, 0 deletions
diff --git a/rts/SMPClosureOps.h b/rts/SMPClosureOps.h
new file mode 100644
index 0000000000..39cde45544
--- /dev/null
+++ b/rts/SMPClosureOps.h
@@ -0,0 +1,129 @@
+/* ----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 2005-2013
+ *
+ * Macros for THREADED_RTS support
+ *
+ * -------------------------------------------------------------------------- */
+
+#ifndef RTS_STORAGE_SMPCLOSUREOPS_H
+#define RTS_STORAGE_SMPCLOSUREOPS_H
+
+#include "BeginPrivate.h"
+
+#ifdef CMINUSMINUS
+
+/* Lock closure, equivalent to ccall lockClosure but the condition is inlined.
+ * Arguments are swapped for uniformity with unlockClosure. */
+#if defined(THREADED_RTS)
+#define LOCK_CLOSURE(closure, info) \
+ if (CInt[n_capabilities] == 1 :: CInt) { \
+ info = GET_INFO(closure); \
+ } else { \
+ ("ptr" info) = ccall reallyLockClosure(closure "ptr"); \
+ }
+#else
+#define LOCK_CLOSURE(closure, info) info = GET_INFO(closure)
+#endif
+
+#define unlockClosure(ptr,info) \
+ prim_write_barrier; \
+ StgHeader_info(ptr) = info;
+
+#else
+
+INLINE_HEADER StgInfoTable *lockClosure(StgClosure *p);
+EXTERN_INLINE StgInfoTable *reallyLockClosure(StgClosure *p);
+EXTERN_INLINE StgInfoTable *tryLockClosure(StgClosure *p);
+EXTERN_INLINE void unlockClosure(StgClosure *p, const StgInfoTable *info);
+
+#if defined(THREADED_RTS)
+
+/* -----------------------------------------------------------------------------
+ * Locking/unlocking closures
+ *
+ * This is used primarily in the implementation of MVars.
+ * -------------------------------------------------------------------------- */
+
+// We want a callable copy of reallyLockClosure() so that we can refer to it
+// from .cmm files compiled using the native codegen, so these are given
+// EXTERN_INLINE. C-- should use LOCK_CLOSURE not lockClosure, so we've
+// kept it INLINE_HEADER.
+EXTERN_INLINE StgInfoTable *reallyLockClosure(StgClosure *p)
+{
+ StgWord info;
+ do {
+ uint32_t i = 0;
+ do {
+ info = xchg((P_)(void *)&p->header.info, (W_)&stg_WHITEHOLE_info);
+ if (info != (W_)&stg_WHITEHOLE_info) return (StgInfoTable *)info;
+ } while (++i < SPIN_COUNT);
+ yieldThread();
+ } while (1);
+}
+
+INLINE_HEADER StgInfoTable *lockClosure(StgClosure *p)
+{
+ if (n_capabilities == 1) {
+ return (StgInfoTable *)p->header.info;
+ }
+ else {
+ return reallyLockClosure(p);
+ }
+}
+
+// ToDo: consider splitting tryLockClosure into reallyTryLockClosure,
+// same as lockClosure
+EXTERN_INLINE StgInfoTable *tryLockClosure(StgClosure *p)
+{
+ StgWord info;
+ if (n_capabilities == 1) {
+ return (StgInfoTable *)p->header.info;
+ }
+ else {
+ info = xchg((P_)(void *)&p->header.info, (W_)&stg_WHITEHOLE_info);
+ if (info != (W_)&stg_WHITEHOLE_info) {
+ return (StgInfoTable *)info;
+ } else {
+ return NULL;
+ }
+ }
+}
+
+#else /* !THREADED_RTS */
+
+EXTERN_INLINE StgInfoTable *
+reallyLockClosure(StgClosure *p)
+{ return (StgInfoTable *)p->header.info; }
+
+INLINE_HEADER StgInfoTable *
+lockClosure(StgClosure *p)
+{ return (StgInfoTable *)p->header.info; }
+
+EXTERN_INLINE StgInfoTable *
+tryLockClosure(StgClosure *p)
+{ return (StgInfoTable *)p->header.info; }
+
+#endif /* THREADED_RTS */
+
+EXTERN_INLINE void unlockClosure(StgClosure *p, const StgInfoTable *info)
+{
+ // This is a strictly ordered write, so we need a write_barrier():
+ write_barrier();
+ p->header.info = info;
+}
+
+// Handy specialised versions of lockClosure()/unlockClosure()
+INLINE_HEADER void lockTSO(StgTSO *tso);
+INLINE_HEADER void lockTSO(StgTSO *tso)
+{ lockClosure((StgClosure *)tso); }
+
+INLINE_HEADER void unlockTSO(StgTSO *tso);
+INLINE_HEADER void unlockTSO(StgTSO *tso)
+{ unlockClosure((StgClosure*)tso, (const StgInfoTable *)&stg_TSO_info); }
+
+#endif /* CMINUSMINUS */
+
+#include "EndPrivate.h"
+
+#endif /* RTS_STORAGE_SMPCLOSUREOPS_H */
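
The locking protocol above boils down to: atomically swap the closure's info pointer with the stg_WHITEHOLE_info sentinel (so other threads see the closure as locked), spin up to SPIN_COUNT times before yielding the OS thread, and unlock by writing the original info pointer back after a write barrier. The following is a standalone, hypothetical sketch of the same pattern in C11 atomics; the Closure type, the whitehole sentinel, and the function names are invented for the example and are not the RTS's.

```
/* Standalone sketch of the spin-lock-on-info-pointer pattern; not RTS code. */
#include <stdatomic.h>
#include <sched.h>

#define SPIN_COUNT 1000          /* assumed value, for illustration only */

typedef struct {
    _Atomic(const void *) info;  /* stands in for StgHeader.info */
    /* ... payload ... */
} Closure;

static const char whitehole;     /* address used as the WHITEHOLE sentinel */

static const void *lock_closure(Closure *p)
{
    for (;;) {
        for (unsigned i = 0; i < SPIN_COUNT; i++) {
            /* swap in the sentinel; whoever reads a non-sentinel value wins */
            const void *info = atomic_exchange_explicit(
                &p->info, (const void *)&whitehole, memory_order_acquire);
            if (info != (const void *)&whitehole) {
                return info;     /* we now own the closure */
            }
        }
        sched_yield();           /* back off, like yieldThread() above */
    }
}

static void unlock_closure(Closure *p, const void *info)
{
    /* the release store plays the role of write_barrier() above */
    atomic_store_explicit(&p->info, info, memory_order_release);
}
```

A caller would bracket its critical section with `const void *info = lock_closure(c); ... unlock_closure(c, info);`, mirroring how the header's lockClosure/unlockClosure pair is intended to be used.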