From a1c85db1ae5b85b7bd16f75642662c698b90b7c1 Mon Sep 17 00:00:00 2001
From: Dylan Yudaken
Date: Thu, 14 Jan 2021 12:02:52 +0000
Subject: Do not cas on slowpath of SpinLock unnecessarily

This is a well known technique to reduce inter-CPU bus traffic while
waiting for the lock by reducing the number of writes.
---
 rts/SpinLock.c | 38 +++++++++++++++++++++++++++++++++++---
 1 file changed, 35 insertions(+), 3 deletions(-)

(limited to 'rts/SpinLock.c')

diff --git a/rts/SpinLock.c b/rts/SpinLock.c
index 5289694aa7..b362d89838 100644
--- a/rts/SpinLock.c
+++ b/rts/SpinLock.c
@@ -24,13 +24,45 @@
 
 #if defined(THREADED_RTS)
 
+#if defined(PROF_SPIN)
+
+ATTR_ALWAYS_INLINE static inline bool try_acquire_spin_slow_path(SpinLock * p)
+{
+    StgWord r;
+    r = cas((StgVolatilePtr)&(p->lock), 1, 0);
+    if (r == 0) RELAXED_ADD(&p->spin, 1);
+    return r != 0;
+}
+
+#else /* !PROF_SPIN */
+
+ATTR_ALWAYS_INLINE static inline bool try_acquire_spin_slow_path(SpinLock * p)
+{
+    StgWord r;
+    // Note
+    //
+    // Here we first check if we can obtain the lock without trying to cas.
+    // The cas instruction will add extra inter-CPU traffic on most CPU
+    // architectures as it has to invalidate cache lines. Rather than adding
+    // this traffic in the spin loop, we rather restrict it to times when the
+    // lock might be available.
+    //
+    // We do not need to do this when PROF_SPIN is enabled, since we write to
+    // the lock in both cases (acquired/not acquired).
+    r = RELAXED_LOAD(&p->lock);
+    if (r != 0) {
+        r = cas((StgVolatilePtr)&(p->lock), 1, 0);
+    }
+    return r != 0;
+}
+
+#endif
+
 void acquire_spin_lock_slow_path(SpinLock * p)
 {
     do {
         for (uint32_t i = 0; i < SPIN_COUNT; i++) {
-            StgWord32 r = cas((StgVolatilePtr)&(p->lock), 1, 0);
-            if (r != 0) return;
-            IF_PROF_SPIN(RELAXED_ADD(&p->spin, 1));
+            if (try_acquire_spin_slow_path(p)) return;
             busy_wait_nop();
         }
         IF_PROF_SPIN(RELAXED_ADD(&p->yield, 1));
--
cgit v1.2.1