-rw-r--r--  atomic/unix/builtins.c    |  7
-rw-r--r--  atomic/unix/builtins64.c  | 11
-rw-r--r--  atomic/unix/mutex64.c     |  9
-rw-r--r--  configure.in              | 68
-rw-r--r--  test/testatomic.c         | 53
5 files changed, 96 insertions(+), 52 deletions(-)
diff --git a/atomic/unix/builtins.c b/atomic/unix/builtins.c
index 22b828c3c..e7b117c71 100644
--- a/atomic/unix/builtins.c
+++ b/atomic/unix/builtins.c
@@ -18,10 +18,11 @@
#ifdef USE_ATOMICS_BUILTINS
-#if defined(__arm__) || defined(__powerpc__) || defined(__powerpc64__)
-#define WEAK_MEMORY_ORDERING 1
-#else
+#if defined(__i386__) || defined(__x86_64__) \
+ || defined(__s390__) || defined(__s390x__)
#define WEAK_MEMORY_ORDERING 0
+#else
+#define WEAK_MEMORY_ORDERING 1
#endif
APR_DECLARE(apr_status_t) apr_atomic_init(apr_pool_t *p)
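
The old allow-list only treated __arm__ and __powerpc__/__powerpc64__ as weakly
ordered, so targets outside that list (aarch64, for example, which does not
define __arm__) silently took the strong-ordering path. The new form inverts the
default: only x86 and s390, which are known to be strongly ordered, skip the
extra barriers. As an illustration of what the macro gates, here is a simplified
sketch of the store path, assumed to mirror the 64-bit apr_atomic_set64() shown
in builtins64.c below (it is not part of this hunk):

    /* Simplified sketch, assumed to mirror apr_atomic_set64() below. */
    APR_DECLARE(void) apr_atomic_set32(volatile apr_uint32_t *mem, apr_uint32_t val)
    {
    #if HAVE__ATOMIC_BUILTINS
        __atomic_store_n(mem, val, __ATOMIC_SEQ_CST);
    #elif WEAK_MEMORY_ORDERING
        /* No __sync_store() available => full barrier plus exchange */
        __sync_synchronize();
        __sync_lock_test_and_set(mem, val);
    #else
        *mem = val;   /* plain store is enough on strongly ordered CPUs */
    #endif
    }
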
diff --git a/atomic/unix/builtins64.c b/atomic/unix/builtins64.c
index 7d8422548..f0cdb309b 100644
--- a/atomic/unix/builtins64.c
+++ b/atomic/unix/builtins64.c
@@ -18,17 +18,18 @@
#ifdef USE_ATOMICS_BUILTINS64
-#if defined(__arm__) || defined(__powerpc__) || defined(__powerpc64__)
-#define WEAK_MEMORY_ORDERING 1
-#else
+#if defined(__i386__) || defined(__x86_64__) \
+ || defined(__s390__) || defined(__s390x__)
#define WEAK_MEMORY_ORDERING 0
+#else
+#define WEAK_MEMORY_ORDERING 1
#endif
APR_DECLARE(apr_uint64_t) apr_atomic_read64(volatile apr_uint64_t *mem)
{
#if HAVE__ATOMIC_BUILTINS
return __atomic_load_n(mem, __ATOMIC_SEQ_CST);
-#elif WEAK_MEMORY_ORDERING
+#elif WEAK_MEMORY_ORDERING || APR_SIZEOF_VOIDP < 8
/* No __sync_load() available => apr_atomic_add64(mem, 0) */
return __sync_fetch_and_add(mem, 0);
#else
@@ -40,7 +41,7 @@ APR_DECLARE(void) apr_atomic_set64(volatile apr_uint64_t *mem, apr_uint64_t val)
{
#if HAVE__ATOMIC_BUILTINS
__atomic_store_n(mem, val, __ATOMIC_SEQ_CST);
-#elif WEAK_MEMORY_ORDERING
+#elif WEAK_MEMORY_ORDERING || APR_SIZEOF_VOIDP < 8
/* No __sync_store() available => apr_atomic_xchg64(mem, val) */
__sync_synchronize();
__sync_lock_test_and_set(mem, val);
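
Besides the same default flip, the 64-bit file now also takes the __sync path
whenever APR_SIZEOF_VOIDP < 8: on a 32-bit build a plain 64-bit load or store
may be split into two 32-bit accesses and is therefore not atomic, no matter how
strongly ordered the CPU is. A minimal standalone sketch of the fallback read,
assuming a GCC/Clang toolchain with 8-byte __sync builtins:

    #include <stdint.h>

    /* There is no __sync_load(), so an atomic 64-bit read is expressed as
     * "fetch-and-add 0", which returns the previous value and acts as a
     * full barrier. */
    static uint64_t read64_fallback(volatile uint64_t *mem)
    {
        return __sync_fetch_and_add(mem, 0);
    }
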
diff --git a/atomic/unix/mutex64.c b/atomic/unix/mutex64.c
index 9fc44af61..a6a7ceb0b 100644
--- a/atomic/unix/mutex64.c
+++ b/atomic/unix/mutex64.c
@@ -96,7 +96,14 @@ apr_status_t apr__atomic_generic64_init(apr_pool_t *p)
APR_DECLARE(apr_uint64_t) apr_atomic_read64(volatile apr_uint64_t *mem)
{
- return *mem;
+ apr_uint64_t cur_value;
+ DECLARE_MUTEX_LOCKED(mutex, mem);
+
+ cur_value = *mem;
+
+ MUTEX_UNLOCK(mutex);
+
+ return cur_value;
}
APR_DECLARE(void) apr_atomic_set64(volatile apr_uint64_t *mem, apr_uint64_t val)
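
In the mutex-based generic implementation, apr_atomic_read64() used to return
*mem directly; on 32-bit platforms that read can tear while a locked writer is
updating the two halves. The fix takes the same mutex as the writers. An
illustrative equivalent using a plain pthread mutex (the real code hides the
locking behind the DECLARE_MUTEX_LOCKED()/MUTEX_UNLOCK() macros defined earlier
in mutex64.c, which this hunk does not show):

    #include <pthread.h>
    #include <stdint.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    static uint64_t generic_read64(volatile uint64_t *mem)
    {
        uint64_t cur;
        pthread_mutex_lock(&lock);
        cur = *mem;   /* cannot observe a half-written value */
        pthread_mutex_unlock(&lock);
        return cur;
    }
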
diff --git a/configure.in b/configure.in
index 46440d151..294654c7f 100644
--- a/configure.in
+++ b/configure.in
@@ -552,31 +552,35 @@ AC_CACHE_CHECK([whether the compiler provides 64bit atomic builtins], [ap_cv_ato
[AC_TRY_RUN([
#if HAVE_STDINT_H
#include <stdint.h>
+typedef uint64_t u64_t;
+#else
+typedef unsigned long long u64_t;
#endif
int main(int argc, const char *const *argv)
{
-#if HAVE_STDINT_H
- uint64_t val = 1010, tmp, *mem = &val;
-#else
- unsigned long long val = 1010, tmp, *mem = &val;
-#endif
-
- if (__sync_fetch_and_add(&val, 1010) != 1010 || val != 2020)
+ struct {
+ char pad0;
+ u64_t val;
+ } s;
+ u64_t *mem = &s.val, tmp;
+
+ s.val = 1010;
+ if (__sync_fetch_and_add(&s.val, 1010) != 1010 || s.val != 2020)
return 1;
- tmp = val;
- if (__sync_fetch_and_sub(mem, 1010) != tmp || val != 1010)
+ tmp = s.val;
+ if (__sync_fetch_and_sub(mem, 1010) != tmp || s.val != 1010)
return 1;
- if (__sync_sub_and_fetch(&val, 1010) != 0 || val != 0)
+ if (__sync_sub_and_fetch(&s.val, 1010) != 0 || s.val != 0)
return 1;
tmp = 3030;
- if (__sync_val_compare_and_swap(mem, 0, tmp) != 0 || val != tmp)
+ if (__sync_val_compare_and_swap(mem, 0, tmp) != 0 || s.val != tmp)
return 1;
__sync_synchronize();
- if (__sync_lock_test_and_set(&val, 4040) != 3030)
+ if (__sync_lock_test_and_set(&s.val, 4040) != 3030)
return 1;
return 0;
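
The runtime probe no longer operates on a conveniently aligned stack variable:
the 64-bit value sits behind a leading char inside a struct, so on ABIs that
only guarantee 4-byte alignment for 64-bit integers in structs (32-bit x86, for
instance) the builtins are exercised on an underaligned operand. That is exactly
the case in which some compilers emit libatomic calls instead of inline
instructions, and the probe should then fail. A small sketch showing where such
a field lands (hypothetical struct, offsets for typical SysV ABIs):

    #include <stdio.h>
    #include <stddef.h>

    struct probe {
        char pad0;
        unsigned long long val;   /* may end up only 4-byte aligned */
    };

    int main(void)
    {
        /* Prints 4 on i386 (SysV ABI), 8 on most 64-bit ABIs. */
        printf("offsetof(val) = %zu\n", offsetof(struct probe, val));
        return 0;
    }
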
@@ -586,31 +590,45 @@ AC_CACHE_CHECK([whether the compiler provides 64bit __atomic builtins], [ap_cv__
[AC_TRY_RUN([
#if HAVE_STDINT_H
#include <stdint.h>
+typedef uint64_t u64_t;
+#else
+typedef unsigned long long u64_t;
#endif
+static int test_always_lock_free(volatile u64_t *val)
+{
+ return __atomic_always_lock_free(sizeof(*val), val);
+}
int main(int argc, const char *const *argv)
{
-#if HAVE_STDINT_H
- uint64_t val = 1010, tmp, *mem = &val;
-#else
- unsigned long long val = 1010, tmp, *mem = &val;
-#endif
+ struct {
+ char pad0;
+ u64_t val;
+ char pad1;
+ u64_t tmp;
+ } s;
+ u64_t *mem = &s.val;
+
+ /* check if alignment matters (no fallback to libatomic) */
+ if (!test_always_lock_free(&s.val))
+ return 1;
- if (__atomic_fetch_add(&val, 1010, __ATOMIC_SEQ_CST) != 1010 || val != 2020)
+ s.val = 1010;
+ if (__atomic_fetch_add(&s.val, 1010, __ATOMIC_SEQ_CST) != 1010 || s.val != 2020)
return 1;
- tmp = val;
- if (__atomic_fetch_sub(mem, 1010, __ATOMIC_SEQ_CST) != tmp || val != 1010)
+ s.tmp = s.val;
+ if (__atomic_fetch_sub(mem, 1010, __ATOMIC_SEQ_CST) != s.tmp || s.val != 1010)
return 1;
- if (__atomic_sub_fetch(&val, 1010, __ATOMIC_SEQ_CST) != 0 || val != 0)
+ if (__atomic_sub_fetch(&s.val, 1010, __ATOMIC_SEQ_CST) != 0 || s.val != 0)
return 1;
- tmp = val;
- if (!__atomic_compare_exchange_n(mem, &tmp, 3030, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
- || tmp != 0)
+ s.tmp = s.val;
+ if (!__atomic_compare_exchange_n(mem, &s.tmp, 3030, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
+ || s.tmp != 0)
return 1;
- if (__atomic_exchange_n(&val, 4040, __ATOMIC_SEQ_CST) != 3030)
+ if (__atomic_exchange_n(&s.val, 4040, __ATOMIC_SEQ_CST) != 3030)
return 1;
return 0;
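
The __atomic probe gets the same struct treatment, plus an explicit
__atomic_always_lock_free() check: if the compiler cannot promise inlined,
lock-free 64-bit operations for a field with that alignment, it would fall back
to libatomic, and the probe rejects the builtins rather than accept a hidden
library dependency. A standalone sketch of that check (illustrative; the helper
is kept out of line in the real probe so the compiler cannot assume extra
alignment):

    #include <stdio.h>

    struct s {
        char pad;
        unsigned long long val;
    };

    static int always_lock_free(volatile unsigned long long *p)
    {
        return __atomic_always_lock_free(sizeof(*p), p);
    }

    int main(void)
    {
        struct s x;
        printf("always lock free: %d\n", always_lock_free(&x.val));
        return 0;
    }
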
diff --git a/test/testatomic.c b/test/testatomic.c
index 4b18bb61e..efcc148b4 100644
--- a/test/testatomic.c
+++ b/test/testatomic.c
@@ -356,10 +356,18 @@ void *APR_THREAD_FUNC thread_func_atomic64(apr_thread_t *thd, void *data);
apr_thread_mutex_t *thread_lock;
apr_thread_mutex_t *thread_lock64;
-volatile apr_uint32_t mutex_locks = 0;
-volatile apr_uint64_t mutex_locks64 = 0;
-volatile apr_uint32_t atomic_ops = 0;
-volatile apr_uint64_t atomic_ops64 = 0;
+apr_uint32_t mutex_locks = 0;
+apr_uint64_t mutex_locks64 = 0;
+apr_uint32_t atomic_ops = 0;
+apr_uint64_t atomic_ops64 = 0;
+#ifndef CACHELINE_SIZE
+#define CACHELINE_SIZE 64
+#endif
+struct {
+ char pad[CACHELINE_SIZE - 5];
+ apr_uint64_t ops64;
+} atomic_pad __attribute__((aligned(CACHELINE_SIZE)));
+
apr_status_t exit_ret_val = 123; /* just some made up number to check on later */
#define NUM_THREADS 40
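
The counters drop the volatile qualifier (they are now reached through the
atomic/mutex APIs or via the test's data pointer), and a second 64-bit counter
is added inside a cache-line-aligned struct with CACHELINE_SIZE - 5 bytes of
leading padding. On ABIs that allow 4-byte alignment for 64-bit fields, that
counter straddles a cache-line boundary, which is the hardest case for the
atomics. A quick way to see where it lands (assumes CACHELINE_SIZE == 64):

    #include <stdio.h>
    #include <stddef.h>
    #include <stdint.h>

    #define CACHELINE_SIZE 64

    struct pad_check {
        char pad[CACHELINE_SIZE - 5];
        uint64_t ops64;
    };

    int main(void)
    {
        /* 60 on i386 (the field crosses the 64-byte line); 64 on ABIs
         * that round 64-bit fields up to 8-byte alignment. */
        printf("ops64 offset: %zu\n", offsetof(struct pad_check, ops64));
        return 0;
    }
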
@@ -639,13 +647,14 @@ void *APR_THREAD_FUNC thread_func_mutex64(apr_thread_t *thd, void *data)
void *APR_THREAD_FUNC thread_func_atomic64(apr_thread_t *thd, void *data)
{
+ apr_uint64_t *ops64 = data;
int i;
for (i = 0; i < NUM_ITERATIONS ; i++) {
- apr_atomic_inc64(&atomic_ops64);
- apr_atomic_add64(&atomic_ops64, 2);
- apr_atomic_dec64(&atomic_ops64);
- apr_atomic_dec64(&atomic_ops64);
+ apr_atomic_inc64(ops64);
+ apr_atomic_add64(ops64, 2);
+ apr_atomic_dec64(ops64);
+ apr_atomic_dec64(ops64);
}
apr_thread_exit(thd, exit_ret_val);
return NULL;
@@ -653,6 +662,7 @@ void *APR_THREAD_FUNC thread_func_atomic64(apr_thread_t *thd, void *data)
static void test_atomics_threaded64(abts_case *tc, void *data)
{
+ apr_uint64_t *ops64 = data;
apr_thread_t *t1[NUM_THREADS];
apr_thread_t *t2[NUM_THREADS];
apr_status_t rv;
@@ -662,13 +672,16 @@ static void test_atomics_threaded64(abts_case *tc, void *data)
pthread_setconcurrency(8);
#endif
+ mutex_locks64 = 0;
+ apr_atomic_set64(ops64, 0);
+
rv = apr_thread_mutex_create(&thread_lock64, APR_THREAD_MUTEX_DEFAULT, p);
APR_ASSERT_SUCCESS(tc, "Could not create lock", rv);
for (i = 0; i < NUM_THREADS; i++) {
apr_status_t r1, r2;
r1 = apr_thread_create(&t1[i], NULL, thread_func_mutex64, NULL, p);
- r2 = apr_thread_create(&t2[i], NULL, thread_func_atomic64, NULL, p);
+ r2 = apr_thread_create(&t2[i], NULL, thread_func_atomic64, ops64, p);
ABTS_ASSERT(tc, "Failed creating threads", !r1 && !r2);
}
@@ -683,7 +696,7 @@ static void test_atomics_threaded64(abts_case *tc, void *data)
ABTS_ULLONG_EQUAL(tc, NUM_THREADS * NUM_ITERATIONS, mutex_locks64);
ABTS_ULLONG_EQUAL(tc, NUM_THREADS * NUM_ITERATIONS,
- apr_atomic_read64(&atomic_ops64));
+ apr_atomic_read64(ops64));
rv = apr_thread_mutex_destroy(thread_lock64);
ABTS_ASSERT(tc, "Failed creating threads", rv == APR_SUCCESS);
@@ -878,11 +891,12 @@ static void test_atomics_busyloop_threaded64(abts_case *tc, void *data)
static void *APR_THREAD_FUNC test_func_set64(apr_thread_t *thd, void *data)
{
+ apr_uint64_t *ops64 = data;
int i;
for (i = 0; i < 1000 * 1000; i++) {
- apr_atomic_set64(&atomic_ops64, APR_UINT64_C(0x1111222233334444));
- apr_atomic_set64(&atomic_ops64, APR_UINT64_C(0x4444555566667777));
+ apr_atomic_set64(ops64, APR_UINT64_C(0x1111222233334444));
+ apr_atomic_set64(ops64, APR_UINT64_C(0x4444555566667777));
}
apr_thread_exit(thd, APR_SUCCESS);
@@ -891,16 +905,17 @@ static void *APR_THREAD_FUNC test_func_set64(apr_thread_t *thd, void *data)
static void test_atomics_threaded_setread64(abts_case *tc, void *data)
{
- apr_status_t retval;
+ apr_uint64_t *ops64 = data;
apr_thread_t *thread;
+ apr_status_t retval;
int i;
- apr_atomic_set64(&atomic_ops64, APR_UINT64_C(0x1111222233334444));
+ apr_atomic_set64(ops64, APR_UINT64_C(0x1111222233334444));
- apr_thread_create(&thread, NULL, test_func_set64, NULL, p);
+ apr_thread_create(&thread, NULL, test_func_set64, ops64, p);
for (i = 0; i < 1000 * 1000 * 2; i++) {
- apr_uint64_t val = apr_atomic_read64(&atomic_ops64);
+ apr_uint64_t val = apr_atomic_read64(ops64);
if (val != APR_UINT64_C(0x1111222233334444) &&
val != APR_UINT64_C(0x4444555566667777))
@@ -950,10 +965,12 @@ abts_suite *testatomic(abts_suite *suite)
#if APR_HAS_THREADS
abts_run_test(suite, test_atomics_threaded, NULL);
- abts_run_test(suite, test_atomics_threaded64, NULL);
+ abts_run_test(suite, test_atomics_threaded64, &atomic_ops64);
+ abts_run_test(suite, test_atomics_threaded64, &atomic_pad.ops64);
abts_run_test(suite, test_atomics_busyloop_threaded, NULL);
abts_run_test(suite, test_atomics_busyloop_threaded64, NULL);
- abts_run_test(suite, test_atomics_threaded_setread64, NULL);
+ abts_run_test(suite, test_atomics_threaded_setread64, &atomic_ops64);
+ abts_run_test(suite, test_atomics_threaded_setread64, &atomic_pad.ops64);
#endif
return suite;
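
The threaded64 and setread64 tests are now registered twice: once with the
ordinary apr_uint64_t global and once with the padded counter that straddles a
cache line. The set/read test alternates two 64-bit constants in a writer
thread while the main thread keeps reading; observing anything other than those
two constants means an access tore. Illustrative arithmetic only (not part of
the patch), this is the kind of mixed value a torn 64-bit access could yield:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t a = UINT64_C(0x1111222233334444);
        uint64_t b = UINT64_C(0x4444555566667777);
        /* high half of one write combined with the low half of the other */
        uint64_t torn = (a & UINT64_C(0xffffffff00000000))
                      | (b & UINT64_C(0x00000000ffffffff));
        printf("torn value: 0x%016llx\n", (unsigned long long)torn);
        return 0;
    }
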