diff options
author | Ivan Maidanski <ivmai@mail.ru> | 2016-04-25 21:26:28 +0300 |
---|---|---|
committer | Ivan Maidanski <ivmai@mail.ru> | 2016-04-25 21:26:28 +0300 |
commit | 98033869ace6a6ef471e5d64e769d38aad8edce3 (patch) | |
tree | cee5512a3efc5da3f268c2501761b7298621ee8e | |
parent | 49b9286a8c304283ece186f1fabb27b587e7a9a6 (diff) | |
download | libatomic_ops-98033869ace6a6ef471e5d64e769d38aad8edce3.tar.gz |
Use GCC atomic intrinsics for x86 and x64 (gcc 4.8+ and clang 3.4+)
* src/atomic_ops/sysdeps/gcc/generic.h (AO_GCC_HAVE_double_SYNC_CAS):
Do not define if AO_SKIPATOMIC_double_compare_and_swap_ANY.
* src/atomic_ops/sysdeps/gcc/x86.h (AO_GCC_ATOMIC_TEST_AND_SET): Define
if gcc4.8+ (but not Intel compiler) or clang3.4+ (if not
AO_DISABLE_GCC_ATOMICS).
* src/atomic_ops/sysdeps/gcc/x86.h [AO_GCC_ATOMIC_TEST_AND_SET]: Do not
include all_aligned_atomic_load_store.h, test_and_set_t_is_char.h,
ordered_except_wr.h.
* src/atomic_ops/sysdeps/gcc/x86.h (AO_nop_full, AO_fetch_and_add_full,
AO_char_fetch_and_add_full, AO_short_fetch_and_add_full, AO_and_full,
AO_or_full, AO_xor_full, AO_test_and_set_full,
AO_compare_and_swap_full) [AO_GCC_ATOMIC_TEST_AND_SET]: Do not define.
* src/atomic_ops/sysdeps/gcc/x86.h: Include standard_ao_double_t.h if
AO_PREFER_BUILTIN_ATOMICS, or AO_GCC_ATOMIC_TEST_AND_SET and not
clang/x86.
* src/atomic_ops/sysdeps/gcc/x86.h (AO_double_load_acquire,
AO_double_store_release, AO_SKIPATOMIC_double_compare_and_swap_ANY):
Define if AO_GCC_ATOMIC_TEST_AND_SET and clang/x86 (unless
AO_PREFER_BUILTIN_ATOMICS).
* src/atomic_ops/sysdeps/gcc/x86.h
(AO_compare_double_and_swap_double_full) [x86]: Define also for
clang/x86 if AO_GCC_ATOMIC_TEST_AND_SET (unless
AO_PREFER_BUILTIN_ATOMICS).
* src/atomic_ops/sysdeps/gcc/x86.h
(AO_compare_double_and_swap_double_full) [x86_64]: Remove TODO item;
check __GCC_HAVE_SYNC_COMPARE_AND_SWAP_16 to detect presence of
cmpxchg16b.
* src/atomic_ops/sysdeps/gcc/x86.h: Include generic.h if
AO_GCC_ATOMIC_TEST_AND_SET.
* src/atomic_ops/sysdeps/standard_ao_double_t.h (double_ptr_storage):
Use unsigned __int128 instead of __m128 for GCC x86_64 atomic
intrinsics.
-rw-r--r-- | src/atomic_ops/sysdeps/gcc/generic.h | 6 | ||||
-rw-r--r-- | src/atomic_ops/sysdeps/gcc/x86.h | 62 | ||||
-rw-r--r-- | src/atomic_ops/sysdeps/standard_ao_double_t.h | 20 |
3 files changed, 66 insertions, 22 deletions
diff --git a/src/atomic_ops/sysdeps/gcc/generic.h b/src/atomic_ops/sysdeps/gcc/generic.h index 256a97e..69116f4 100644 --- a/src/atomic_ops/sysdeps/gcc/generic.h +++ b/src/atomic_ops/sysdeps/gcc/generic.h @@ -160,9 +160,11 @@ # define AO_HAVE_double_store_release # endif -# if (__SIZEOF_SIZE_T__ == 4 && defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8)) \ +# if ((__SIZEOF_SIZE_T__ == 4 \ + && defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8)) \ || (__SIZEOF_SIZE_T__ == 8 /* half of AO_double_t */ \ - && defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_16)) + && defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_16))) \ + && !defined(AO_SKIPATOMIC_double_compare_and_swap_ANY) # define AO_GCC_HAVE_double_SYNC_CAS # endif diff --git a/src/atomic_ops/sysdeps/gcc/x86.h b/src/atomic_ops/sysdeps/gcc/x86.h index db66dda..dc25505 100644 --- a/src/atomic_ops/sysdeps/gcc/x86.h +++ b/src/atomic_ops/sysdeps/gcc/x86.h @@ -16,6 +16,15 @@ * Some of the machine specific code was borrowed from our GC distribution. */ +#if (((__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) \ + && !defined(__INTEL_COMPILER)) /* TODO: test and enable icc */ \ + || __clang_major__ > 3 \ + || (__clang_major__ == 3 && __clang_minor__ >= 4)) \ + && !defined(AO_DISABLE_GCC_ATOMICS) +# define AO_GCC_ATOMIC_TEST_AND_SET + +#else /* AO_DISABLE_GCC_ATOMICS */ + /* The following really assume we have a 486 or better. Unfortunately */ /* gcc doesn't define a suitable feature test macro based on command */ /* line options. */ @@ -174,7 +183,31 @@ AO_fetch_compare_and_swap_full(volatile AO_t *addr, AO_t old_val, } #define AO_HAVE_fetch_compare_and_swap_full -#if !defined(__x86_64__) && !defined(AO_USE_SYNC_CAS_BUILTIN) + /* Real X86 implementations, except for some old 32-bit WinChips, */ + /* appear to enforce ordering between memory operations, EXCEPT that */ + /* a later read can pass earlier writes, presumably due to the */ + /* visible presence of store buffers. */ + /* We ignore both the WinChips and the fact that the official specs */ + /* seem to be much weaker (and arguably too weak to be usable). */ +# include "../ordered_except_wr.h" + +#endif /* AO_DISABLE_GCC_ATOMICS */ + +#if (defined(AO_PREFER_BUILTIN_ATOMICS) || !defined(__clang__) \ + || defined(__x86_64__)) && defined(AO_GCC_ATOMIC_TEST_AND_SET) + + /* As of clang-3.8 i686 (NDK r11c), it requires -latomic for all the */ + /* double-wide operations. For now, we fall back to the */ + /* non-intrinsic implementation if clang/x86. */ + /* TODO: Refine for newer clang releases. */ + +# if defined(__ILP32__) || !defined(__x86_64__) /* 32-bit AO_t */ \ + || defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_16) /* 64-bit AO_t */ +# include "../standard_ao_double_t.h" +# endif + +#elif !defined(__x86_64__) && (!defined(AO_USE_SYNC_CAS_BUILTIN) \ + || defined(AO_GCC_ATOMIC_TEST_AND_SET)) # include "../standard_ao_double_t.h" /* Reading or writing a quadword aligned on a 64-bit boundary is */ @@ -183,6 +216,18 @@ AO_fetch_compare_and_swap_full(volatile AO_t *addr, AO_t old_val, # define AO_ACCESS_double_CHECK_ALIGNED # include "../loadstore/double_atomic_load_store.h" +# ifdef AO_GCC_ATOMIC_TEST_AND_SET + /* Double-wide loads and stores are ordered. */ +# define AO_double_load_acquire(addr) AO_double_load_read(addr) +# define AO_HAVE_double_load_acquire + +# define AO_double_store_release(addr, val) \ + (AO_nop_write(), AO_double_store(addr, val)) +# define AO_HAVE_double_store_release + +# define AO_SKIPATOMIC_double_compare_and_swap_ANY +# endif /* AO_GCC_ATOMIC_TEST_AND_SET */ + /* Returns nonzero if the comparison succeeded. */ /* Really requires at least a Pentium. */ AO_INLINE int @@ -298,9 +343,8 @@ AO_fetch_compare_and_swap_full(volatile AO_t *addr, AO_t old_val, /* Thus, currently, the only way to implement lock-free double_load */ /* and double_store on x86_64 is to use CMPXCHG16B (if available). */ -/* TODO: Test some gcc macro to detect presence of cmpxchg16b. */ - -# ifdef AO_CMPXCHG16B_AVAILABLE +# if defined(AO_CMPXCHG16B_AVAILABLE) \ + || defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_16) # include "../standard_ao_double_t.h" /* NEC LE-IT: older AMD Opterons are missing this instruction. */ @@ -351,10 +395,6 @@ AO_fetch_compare_and_swap_full(volatile AO_t *addr, AO_t old_val, #endif /* x86_64 && !ILP32 */ -/* Real X86 implementations, except for some old 32-bit WinChips, */ -/* appear to enforce ordering between memory operations, EXCEPT that */ -/* a later read can pass earlier writes, presumably due to the visible */ -/* presence of store buffers. */ -/* We ignore both the WinChips and the fact that the official specs */ -/* seem to be much weaker (and arguably too weak to be usable). */ -#include "../ordered_except_wr.h" +#ifdef AO_GCC_ATOMIC_TEST_AND_SET +# include "generic.h" +#endif diff --git a/src/atomic_ops/sysdeps/standard_ao_double_t.h b/src/atomic_ops/sysdeps/standard_ao_double_t.h index bf76979..2fcf219 100644 --- a/src/atomic_ops/sysdeps/standard_ao_double_t.h +++ b/src/atomic_ops/sysdeps/standard_ao_double_t.h @@ -20,20 +20,22 @@ * SOFTWARE. */ -/* For 64-bit systems, we extend the double type to hold two int64's. */ -/* x86-64 (except for x32): __m128 serves as a placeholder which also */ -/* requires the compiler to align it on 16-byte boundary (as required */ -/* by cmpxchg16). */ -/* Similar things could be done for PPC 64-bit using a VMX data type. */ +/* For 64-bit systems, we expect the double type to hold two int64's. */ -#if ((defined(__x86_64__) && __GNUC__ >= 4) || defined(_WIN64)) \ - && !defined(__ILP32__) +#if ((defined(__x86_64__) && defined(AO_GCC_ATOMIC_TEST_AND_SET)) \ + || defined(__aarch64__)) && !defined(__ILP32__) + /* x86-64: __m128 is not applicable to atomic intrinsics. */ + typedef unsigned __int128 double_ptr_storage; +#elif ((defined(__x86_64__) && __GNUC__ >= 4) || defined(_WIN64)) \ + && !defined(__ILP32__) + /* x86-64 (except for x32): __m128 serves as a placeholder which also */ + /* requires the compiler to align it on 16-byte boundary (as required */ + /* by cmpxchg16b). */ + /* Similar things could be done for PPC 64-bit using a VMX data type. */ # include <xmmintrin.h> typedef __m128 double_ptr_storage; #elif defined(_WIN32) && !defined(__GNUC__) typedef unsigned __int64 double_ptr_storage; -#elif defined(__aarch64__) - typedef unsigned __int128 double_ptr_storage; #else typedef unsigned long long double_ptr_storage; #endif |