author     Ivan Maidanski <ivmai@mail.ru>  2016-04-25 21:26:28 +0300
committer  Ivan Maidanski <ivmai@mail.ru>  2016-04-25 21:26:28 +0300
commit     98033869ace6a6ef471e5d64e769d38aad8edce3 (patch)
tree       cee5512a3efc5da3f268c2501761b7298621ee8e
parent     49b9286a8c304283ece186f1fabb27b587e7a9a6 (diff)
download   libatomic_ops-98033869ace6a6ef471e5d64e769d38aad8edce3.tar.gz
Use GCC atomic intrinsics for x86 and x64 (gcc 4.8+ and clang 3.4+)
* src/atomic_ops/sysdeps/gcc/generic.h (AO_GCC_HAVE_double_SYNC_CAS): Do not define if AO_SKIPATOMIC_double_compare_and_swap_ANY.
* src/atomic_ops/sysdeps/gcc/x86.h (AO_GCC_ATOMIC_TEST_AND_SET): Define if gcc 4.8+ (but not Intel compiler) or clang 3.4+ (if not AO_DISABLE_GCC_ATOMICS).
* src/atomic_ops/sysdeps/gcc/x86.h [AO_GCC_ATOMIC_TEST_AND_SET]: Do not include all_aligned_atomic_load_store.h, test_and_set_t_is_char.h, ordered_except_wr.h.
* src/atomic_ops/sysdeps/gcc/x86.h (AO_nop_full, AO_fetch_and_add_full, AO_char_fetch_and_add_full, AO_short_fetch_and_add_full, AO_and_full, AO_or_full, AO_xor_full, AO_test_and_set_full, AO_compare_and_swap_full) [AO_GCC_ATOMIC_TEST_AND_SET]: Do not define.
* src/atomic_ops/sysdeps/gcc/x86.h: Include standard_ao_double_t.h if AO_PREFER_BUILTIN_ATOMICS, or AO_GCC_ATOMIC_TEST_AND_SET and not clang/x86.
* src/atomic_ops/sysdeps/gcc/x86.h (AO_double_load_acquire, AO_double_store_release, AO_SKIPATOMIC_double_compare_and_swap_ANY): Define if AO_GCC_ATOMIC_TEST_AND_SET and clang/x86 (unless AO_PREFER_BUILTIN_ATOMICS).
* src/atomic_ops/sysdeps/gcc/x86.h (AO_compare_double_and_swap_double_full) [x86]: Define also for clang/x86 if AO_GCC_ATOMIC_TEST_AND_SET (unless AO_PREFER_BUILTIN_ATOMICS).
* src/atomic_ops/sysdeps/gcc/x86.h (AO_compare_double_and_swap_double_full) [x86_64]: Remove TODO item; check __GCC_HAVE_SYNC_COMPARE_AND_SWAP_16 to detect presence of cmpxchg16b.
* src/atomic_ops/sysdeps/gcc/x86.h: Include generic.h if AO_GCC_ATOMIC_TEST_AND_SET.
* src/atomic_ops/sysdeps/standard_ao_double_t.h (double_ptr_storage): Use unsigned __int128 instead of __m128 for GCC x86_64 atomic intrinsics.
-rw-r--r--  src/atomic_ops/sysdeps/gcc/generic.h           |  6
-rw-r--r--  src/atomic_ops/sysdeps/gcc/x86.h               | 62
-rw-r--r--  src/atomic_ops/sysdeps/standard_ao_double_t.h  | 20
3 files changed, 66 insertions(+), 22 deletions(-)
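
For orientation, a minimal stand-alone sketch (hypothetical code, not part of libatomic_ops) of the compiler-version guard this commit introduces and two of the __atomic builtins the x86/x64 port now relies on; HAVE_GCC_ATOMICS, counter and lock_byte are illustrative names.

#include <stdio.h>

/* Same shape of guard as the one added to gcc/x86.h: gcc 4.8+ (but not  */
/* icc) or clang 3.4+.  Undefined macros evaluate to 0 in #if, so the    */
/* __clang_* tests are harmless under plain gcc.                         */
#if (((__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) \
      && !defined(__INTEL_COMPILER)) \
     || __clang_major__ > 3 \
     || (__clang_major__ == 3 && __clang_minor__ >= 4))
# define HAVE_GCC_ATOMICS
#endif

static unsigned long counter;
static unsigned char lock_byte;

int main(void)
{
#ifdef HAVE_GCC_ATOMICS
  /* Fetch-and-add with sequentially consistent ("full") ordering.       */
  unsigned long old = __atomic_fetch_add(&counter, 1, __ATOMIC_SEQ_CST);

  /* Byte-wide test-and-set, the builtin behind the new                  */
  /* AO_GCC_ATOMIC_TEST_AND_SET path, then a release-ordered clear.      */
  int was_set = __atomic_test_and_set(&lock_byte, __ATOMIC_SEQ_CST);
  __atomic_store_n(&lock_byte, 0, __ATOMIC_RELEASE);

  printf("old=%lu was_set=%d\n", old, was_set);
#endif
  return 0;
}
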
diff --git a/src/atomic_ops/sysdeps/gcc/generic.h b/src/atomic_ops/sysdeps/gcc/generic.h
index 256a97e..69116f4 100644
--- a/src/atomic_ops/sysdeps/gcc/generic.h
+++ b/src/atomic_ops/sysdeps/gcc/generic.h
@@ -160,9 +160,11 @@
# define AO_HAVE_double_store_release
# endif
-# if (__SIZEOF_SIZE_T__ == 4 && defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8)) \
+# if ((__SIZEOF_SIZE_T__ == 4 \
+ && defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8)) \
|| (__SIZEOF_SIZE_T__ == 8 /* half of AO_double_t */ \
- && defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_16))
+ && defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_16))) \
+ && !defined(AO_SKIPATOMIC_double_compare_and_swap_ANY)
# define AO_GCC_HAVE_double_SYNC_CAS
# endif
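
The generic.h hunk above now keys the 64-bit case off __GCC_HAVE_SYNC_COMPARE_AND_SWAP_16 and backs off entirely when AO_SKIPATOMIC_double_compare_and_swap_ANY is defined. A hedged illustration of what that predefined macro promises on x86_64, namely a 16-byte __sync CAS (cmpxchg16b, typically requiring -mcx16); dword_t, cell and try_update are made-up names, not libatomic_ops code.

#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_16
typedef unsigned __int128 dword_t;           /* twice the width of AO_t  */

static dword_t cell __attribute__((aligned(16)));

/* Returns nonzero if the 16-byte compare-and-swap succeeded.            */
int try_update(dword_t expected, dword_t desired)
{
  return __sync_bool_compare_and_swap(&cell, expected, desired);
}
#endif
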
diff --git a/src/atomic_ops/sysdeps/gcc/x86.h b/src/atomic_ops/sysdeps/gcc/x86.h
index db66dda..dc25505 100644
--- a/src/atomic_ops/sysdeps/gcc/x86.h
+++ b/src/atomic_ops/sysdeps/gcc/x86.h
@@ -16,6 +16,15 @@
* Some of the machine specific code was borrowed from our GC distribution.
*/
+#if (((__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) \
+ && !defined(__INTEL_COMPILER)) /* TODO: test and enable icc */ \
+ || __clang_major__ > 3 \
+ || (__clang_major__ == 3 && __clang_minor__ >= 4)) \
+ && !defined(AO_DISABLE_GCC_ATOMICS)
+# define AO_GCC_ATOMIC_TEST_AND_SET
+
+#else /* AO_DISABLE_GCC_ATOMICS */
+
/* The following really assume we have a 486 or better. Unfortunately */
/* gcc doesn't define a suitable feature test macro based on command */
/* line options. */
@@ -174,7 +183,31 @@ AO_fetch_compare_and_swap_full(volatile AO_t *addr, AO_t old_val,
}
#define AO_HAVE_fetch_compare_and_swap_full
-#if !defined(__x86_64__) && !defined(AO_USE_SYNC_CAS_BUILTIN)
+ /* Real X86 implementations, except for some old 32-bit WinChips, */
+ /* appear to enforce ordering between memory operations, EXCEPT that */
+ /* a later read can pass earlier writes, presumably due to the */
+ /* visible presence of store buffers. */
+ /* We ignore both the WinChips and the fact that the official specs */
+ /* seem to be much weaker (and arguably too weak to be usable). */
+# include "../ordered_except_wr.h"
+
+#endif /* AO_DISABLE_GCC_ATOMICS */
+
+#if (defined(AO_PREFER_BUILTIN_ATOMICS) || !defined(__clang__) \
+ || defined(__x86_64__)) && defined(AO_GCC_ATOMIC_TEST_AND_SET)
+
+ /* As of clang-3.8 i686 (NDK r11c), it requires -latomic for all the */
+ /* double-wide operations. For now, we fall back to the */
+ /* non-intrinsic implementation if clang/x86. */
+ /* TODO: Refine for newer clang releases. */
+
+# if defined(__ILP32__) || !defined(__x86_64__) /* 32-bit AO_t */ \
+ || defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_16) /* 64-bit AO_t */
+# include "../standard_ao_double_t.h"
+# endif
+
+#elif !defined(__x86_64__) && (!defined(AO_USE_SYNC_CAS_BUILTIN) \
+ || defined(AO_GCC_ATOMIC_TEST_AND_SET))
# include "../standard_ao_double_t.h"
/* Reading or writing a quadword aligned on a 64-bit boundary is */
@@ -183,6 +216,18 @@ AO_fetch_compare_and_swap_full(volatile AO_t *addr, AO_t old_val,
# define AO_ACCESS_double_CHECK_ALIGNED
# include "../loadstore/double_atomic_load_store.h"
+# ifdef AO_GCC_ATOMIC_TEST_AND_SET
+ /* Double-wide loads and stores are ordered. */
+# define AO_double_load_acquire(addr) AO_double_load_read(addr)
+# define AO_HAVE_double_load_acquire
+
+# define AO_double_store_release(addr, val) \
+ (AO_nop_write(), AO_double_store(addr, val))
+# define AO_HAVE_double_store_release
+
+# define AO_SKIPATOMIC_double_compare_and_swap_ANY
+# endif /* AO_GCC_ATOMIC_TEST_AND_SET */
+
/* Returns nonzero if the comparison succeeded. */
/* Really requires at least a Pentium. */
AO_INLINE int
@@ -298,9 +343,8 @@ AO_fetch_compare_and_swap_full(volatile AO_t *addr, AO_t old_val,
/* Thus, currently, the only way to implement lock-free double_load */
/* and double_store on x86_64 is to use CMPXCHG16B (if available). */
-/* TODO: Test some gcc macro to detect presence of cmpxchg16b. */
-
-# ifdef AO_CMPXCHG16B_AVAILABLE
+# if defined(AO_CMPXCHG16B_AVAILABLE) \
+ || defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_16)
# include "../standard_ao_double_t.h"
/* NEC LE-IT: older AMD Opterons are missing this instruction. */
@@ -351,10 +395,6 @@ AO_fetch_compare_and_swap_full(volatile AO_t *addr, AO_t old_val,
#endif /* x86_64 && !ILP32 */
-/* Real X86 implementations, except for some old 32-bit WinChips, */
-/* appear to enforce ordering between memory operations, EXCEPT that */
-/* a later read can pass earlier writes, presumably due to the visible */
-/* presence of store buffers. */
-/* We ignore both the WinChips and the fact that the official specs */
-/* seem to be much weaker (and arguably too weak to be usable). */
-#include "../ordered_except_wr.h"
+#ifdef AO_GCC_ATOMIC_TEST_AND_SET
+# include "generic.h"
+#endif
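
With AO_GCC_ATOMIC_TEST_AND_SET defined, the trailing include of generic.h is what supplies the ordinary primitives from the __atomic builtins. Roughly, under illustrative my_* names (not the real AO_* definitions that generic.h generates):

#include <stddef.h>

size_t my_load_acquire(const volatile size_t *addr)
{
  return __atomic_load_n(addr, __ATOMIC_ACQUIRE);
}

void my_store_release(volatile size_t *addr, size_t val)
{
  __atomic_store_n(addr, val, __ATOMIC_RELEASE);
}

/* Returns nonzero if *addr was equal to old_val and was replaced.       */
int my_compare_and_swap_full(volatile size_t *addr,
                             size_t old_val, size_t new_val)
{
  return __atomic_compare_exchange_n(addr, &old_val, new_val,
                                     0 /* strong */,
                                     __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
}
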
diff --git a/src/atomic_ops/sysdeps/standard_ao_double_t.h b/src/atomic_ops/sysdeps/standard_ao_double_t.h
index bf76979..2fcf219 100644
--- a/src/atomic_ops/sysdeps/standard_ao_double_t.h
+++ b/src/atomic_ops/sysdeps/standard_ao_double_t.h
@@ -20,20 +20,22 @@
* SOFTWARE.
*/
-/* For 64-bit systems, we extend the double type to hold two int64's. */
-/* x86-64 (except for x32): __m128 serves as a placeholder which also */
-/* requires the compiler to align it on 16-byte boundary (as required */
-/* by cmpxchg16). */
-/* Similar things could be done for PPC 64-bit using a VMX data type. */
+/* For 64-bit systems, we expect the double type to hold two int64's. */
-#if ((defined(__x86_64__) && __GNUC__ >= 4) || defined(_WIN64)) \
- && !defined(__ILP32__)
+#if ((defined(__x86_64__) && defined(AO_GCC_ATOMIC_TEST_AND_SET)) \
+ || defined(__aarch64__)) && !defined(__ILP32__)
+ /* x86-64: __m128 is not applicable to atomic intrinsics. */
+ typedef unsigned __int128 double_ptr_storage;
+#elif ((defined(__x86_64__) && __GNUC__ >= 4) || defined(_WIN64)) \
+ && !defined(__ILP32__)
+ /* x86-64 (except for x32): __m128 serves as a placeholder which also */
+ /* requires the compiler to align it on 16-byte boundary (as required */
+ /* by cmpxchg16b). */
+ /* Similar things could be done for PPC 64-bit using a VMX data type. */
# include <xmmintrin.h>
typedef __m128 double_ptr_storage;
#elif defined(_WIN32) && !defined(__GNUC__)
typedef unsigned __int64 double_ptr_storage;
-#elif defined(__aarch64__)
- typedef unsigned __int128 double_ptr_storage;
#else
typedef unsigned long long double_ptr_storage;
#endif
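
The standard_ao_double_t.h hunk replaces __m128 with unsigned __int128 on the intrinsics path because, as the new comment notes, __m128 is not applicable to atomic intrinsics (the __atomic *_n builtins want an integer or pointer operand). A hypothetical sketch of driving such a 16-byte storage type through the builtins on x86_64; field and function names are invented, and -mcx16 (or linking -latomic) is assumed for the operation to be atomic.

typedef unsigned __int128 double_ptr_storage;   /* as in the new branch  */

typedef union {
  double_ptr_storage AO_whole;
  struct { unsigned long v1, v2; } parts;       /* illustrative layout   */
} pair_t __attribute__((aligned(16)));

static pair_t g_pair;

/* Returns nonzero if the whole 16-byte pair was swapped atomically.     */
int swap_pair(double_ptr_storage expected, double_ptr_storage desired)
{
  return __atomic_compare_exchange_n(&g_pair.AO_whole, &expected, desired,
                                     0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
}
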