diff options
author | Austin Seipp <austin@well-typed.com> | 2014-01-28 06:15:19 -0600 |
---|---|---|
committer | Austin Seipp <austin@well-typed.com> | 2014-01-28 06:54:58 -0600 |
commit | 28b031c506122e28e0230a562a4f6fd3d0256d0c (patch) | |
tree | ecba3eb2c23549b931339e13ab9d52025b50d6fc /rts | |
parent | f9652e22da592ec0689b4919c003c67dacf28d31 (diff) | |
download | haskell-28b031c506122e28e0230a562a4f6fd3d0256d0c.tar.gz |
Refactor GCTDecl.h, and mitigate #7602 a bit
This basically cleans a lot of GCTDecl up - I found it quite hard to
read and a bit confusing. The changes are mostly cosmetic: better
delineation between the alternative cases and light touchups, and tries
to make every branch as consistent as possible.
However, this patch does have one significant effect: it will ensure
that any LLVM-based compilers will use __thread if they support it.
Before, they would simply always use pthread_getspecific and
pthread_setspecific, which are almost surely even *more* inefficient.
The details are a bit too long and boring to go into here; see #7602.
After talking with Simon, we decided to play it safe - __thread can at
least be optimized by future clang releases even further on OS X if they
choose, and it's safer until we can investigate the pthread
implementation further on Mavericks.
For Linux, the story isn't so bleak if you use Clang (for whatever
reason) - Linux directly writes to `%fs` for __thread slots (while OS X
will perform a load followed by an indirect call.) So it should still be
fairly competitive, speed-wise.
Signed-off-by: Austin Seipp <austin@well-typed.com>
Diffstat (limited to 'rts')
-rw-r--r-- | rts/sm/GCTDecl.h | 152 |
1 files changed, 93 insertions, 59 deletions
diff --git a/rts/sm/GCTDecl.h b/rts/sm/GCTDecl.h index 2c08e10783..affb852ba9 100644 --- a/rts/sm/GCTDecl.h +++ b/rts/sm/GCTDecl.h @@ -1,10 +1,10 @@ /* ----------------------------------------------------------------------------- * - * (c) The GHC Team 1998-2009 + * (c) The GHC Team 1998-2014 * * Documentation on the architecture of the Garbage Collector can be * found in the online commentary: - * + * * http://ghc.haskell.org/trac/ghc/wiki/Commentary/Rts/Storage/GC * * ---------------------------------------------------------------------------*/ @@ -14,94 +14,128 @@ #include "BeginPrivate.h" -/* ----------------------------------------------------------------------------- - The gct variable is thread-local and points to the current thread's - gc_thread structure. It is heavily accessed, so we try to put gct - into a global register variable if possible; if we don't have a - register then use gcc's __thread extension to create a thread-local - variable. - -------------------------------------------------------------------------- */ +/* The gct variable is thread-local and points to the current thread's + gc_thread structure. It is heavily accessed, and thus high + performance access is crucial to parallel (-threaded) workloads. + + First, we try to use a 'global register variable' which is a GCC + extension. This reserves the register globally. + + If that's not possible, then we need to use __thread, which is a + compiler/OS specific TLS storage mechanism (assumed to be Fast + Enough.) + + BUT, some older versions of OS X compilers (llvm-gcc, older Clangs) + do not support __thread at all. Modern clang however, does - but on + OS X it's not as fast as the Linux (which can write directly into a + segment register - see #7602.) + + If we don't support __thread then we do the absolute worst thing: + we just use pthread_getspecific and pthread_setspecific (which are + horribly slow.) +*/ -#if defined(THREADED_RTS) +#define GCT_REG_DECL(type,name,reg) register type name REG(reg); -#define GLOBAL_REG_DECL(type,name,reg) register type name REG(reg); -#ifdef llvm_CC_FLAVOR +/* -------------------------------------------------------------------------- */ + +/* First: if we're not using the threaded RTS, it's easy: just fake it. */ +#if !defined(THREADED_RTS) +extern StgWord8 the_gc_thread[]; +#define gct ((gc_thread*)&the_gc_thread) +#define SET_GCT(to) /*nothing*/ +#define DECLARE_GCT /*nothing*/ + +#else /* defined(THREADED_RTS) */ + +/* -------------------------------------------------------------------------- */ + +/* Now, llvm-gcc and some older Clang compilers do not support + __thread. So we have to fallback to the extremely slow case, + unfortunately. Note: clang_CC_FLAVOR implies llvm_CC_FLAVOR */ +#if defined(llvm_CC_FLAVOR) && (CC_SUPPORTS_TLS == 0) +#define gct ((gc_thread *)(pthread_getspecific(gctKey))) #define SET_GCT(to) (pthread_setspecific(gctKey, to)) -#else -#define SET_GCT(to) gct = (to) -#endif +#define DECLARE_GCT ThreadLocalKey gctKey; +/* -------------------------------------------------------------------------- */ +/* However, if we *are* using an LLVM based compiler with __thread + support, then use that (since LLVM doesn't support global register + variables.) */ +#elif defined(llvm_CC_FLAVOR) && (CC_SUPPORTS_TLS == 1) +extern __thread gc_thread* gct; +#define SET_GCT(to) gct = (to) +#define DECLARE_GCT __thread gc_thread* gct; -#if (defined(i386_HOST_ARCH) && defined(linux_HOST_OS)) -// Using __thread is better than stealing a register on x86/Linux, because -// we have too few registers available. In my tests it was worth -// about 5% in GC performance, but of course that might change as gcc -// improves. -- SDM 2009/04/03 -// -// For MacOSX, we can use an llvm-based C compiler which will store the gct -// in a thread local variable using pthreads. +/* -------------------------------------------------------------------------- */ +/* Next up: Using __thread is better than stealing a register on + x86/Linux, because we have too few registers available. In my + tests it was worth about 5% in GC performance, but of course that + might change as gcc improves. -- SDM 2009/04/03 */ +#elif (defined(i386_HOST_ARCH) && defined(linux_HOST_OS)) extern __thread gc_thread* gct; +#define SET_GCT(to) gct = (to) #define DECLARE_GCT __thread gc_thread* gct; -#elif defined(llvm_CC_FLAVOR) -// LLVM does not support the __thread extension and will generate -// incorrect code for global register variables. If we are compiling -// with a C compiler that uses an LLVM back end (clang or llvm-gcc) then we -// use pthread_getspecific() to handle the thread local storage for gct. -#define gct ((gc_thread *)(pthread_getspecific(gctKey))) -#define DECLARE_GCT ThreadLocalKey gctKey; +/* -------------------------------------------------------------------------- */ -#elif defined(sparc_HOST_ARCH) -// On SPARC we can't pin gct to a register. Names like %l1 are just offsets -// into the register window, which change on each function call. -// -// There are eight global (non-window) registers, but they're used for other purposes. -// %g0 -- always zero -// %g1 -- volatile over function calls, used by the linker -// %g2-%g3 -- used as scratch regs by the C compiler (caller saves) -// %g4 -- volatile over function calls, used by the linker -// %g5-%g7 -- reserved by the OS +/* Next up: On SPARC we can't pin gct to a register. Names like %l1 + are just offsets into the register window, which change on each + function call. + There are eight global (non-window) registers, but they're used for other + purposes: + + %g0 -- always zero + %g1 -- volatile over function calls, used by the linker + %g2-%g3 -- used as scratch regs by the C compiler (caller saves) + %g4 -- volatile over function calls, used by the linker + %g5-%g7 -- reserved by the OS +*/ +#elif defined(sparc_HOST_ARCH) extern __thread gc_thread* gct; +#define SET_GCT(to) gct = (to) #define DECLARE_GCT __thread gc_thread* gct; +/* -------------------------------------------------------------------------- */ +/* Next up: generally, if REG_Base is defined and we're *not* using + i386, then actually declare the needed register. The catch for i386 + here is that REG_Base is %ebx, but that is also used for -fPIC, so + it can't be stolen */ #elif defined(REG_Base) && !defined(i386_HOST_ARCH) -// on i386, REG_Base is %ebx which is also used for PIC, so we don't -// want to steal it - -GLOBAL_REG_DECL(gc_thread*, gct, REG_Base) +GCT_REG_DECL(gc_thread*, gct, REG_Base); +#define SET_GCT(to) gct = (to) #define DECLARE_GCT /* nothing */ +/* -------------------------------------------------------------------------- */ +/* Next up: if REG_R1 is available after checking REG_Base, we're + gonna steal it in every case we can. */ #elif defined(REG_R1) - -GLOBAL_REG_DECL(gc_thread*, gct, REG_R1) +GCT_REG_DECL(gc_thread*, gct, REG_R1); +#define SET_GCT(to) gct = (to) #define DECLARE_GCT /* nothing */ +/* -------------------------------------------------------------------------- */ -#elif defined(__GNUC__) - +/* Finally, as an absolute fallback, if none of the above tests check + out but we *do* have __thread support, then use that. */ +#elif CC_SUPPORTS_TLS == 1 extern __thread gc_thread* gct; +#define SET_GCT(to) gct = (to) #define DECLARE_GCT __thread gc_thread* gct; -#else - -#error Cannot find a way to declare the thread-local gct +/* -------------------------------------------------------------------------- */ +/* Impossible! */ +#else +#error Cannot find a way to declare the thread-local gc variable! #endif -#else // not the threaded RTS - -extern StgWord8 the_gc_thread[]; - -#define gct ((gc_thread*)&the_gc_thread) -#define SET_GCT(to) /*nothing*/ -#define DECLARE_GCT /*nothing*/ - #endif // THREADED_RTS #include "EndPrivate.h" |