Diffstat (limited to 'include/atomic')
-rw-r--r--  include/atomic/gcc_builtins.h    5
-rw-r--r--  include/atomic/generic-msvc.h  134
-rw-r--r--  include/atomic/nolock.h         69
-rw-r--r--  include/atomic/rwlock.h         70
-rw-r--r--  include/atomic/solaris.h       206
-rw-r--r--  include/atomic/x86-gcc.h       104
-rw-r--r--  include/atomic/x86-msvc.h       96
7 files changed, 348 insertions, 336 deletions
diff --git a/include/atomic/gcc_builtins.h b/include/atomic/gcc_builtins.h
index 01ebc38707e..100ff80cacd 100644
--- a/include/atomic/gcc_builtins.h
+++ b/include/atomic/gcc_builtins.h
@@ -18,7 +18,7 @@
 #define make_atomic_add_body(S) \
   v= __sync_fetch_and_add(a, v);
-#define make_atomic_swap_body(S) \
+#define make_atomic_fas_body(S) \
   v= __sync_lock_test_and_set(a, v);
 #define make_atomic_cas_body(S) \
   int ## S sav; \
@@ -28,7 +28,10 @@
 #ifdef MY_ATOMIC_MODE_DUMMY
 #define make_atomic_load_body(S)   ret= *a
 #define make_atomic_store_body(S)  *a= v
+#define MY_ATOMIC_MODE "gcc-builtins-up"
+
 #else
+#define MY_ATOMIC_MODE "gcc-builtins-smp"
 #define make_atomic_load_body(S) \
   ret= __sync_fetch_and_or(a, 0);
 #define make_atomic_store_body(S) \
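For readers unfamiliar with the GCC builtins this header maps onto, here is a minimal standalone sketch (not part of the patch; the demo names are illustrative) of the add, fetch-and-store ("fas") and CAS semantics the macro bodies above rely on. These are the documented GCC __sync primitives, available since GCC 4.1:

    /* demo.c - illustrative only; compile with: gcc demo.c */
    #include <stdio.h>

    int main(void)
    {
      int a= 10, v= 5, cmp= 10, set= 42, ret, sav;

      v= __sync_fetch_and_add(&a, v);      /* add: a: 10 -> 15, v gets old 10 */
      v= __sync_lock_test_and_set(&a, v);  /* fas: a: 15 -> 10, v gets old 15 */

      /* the make_atomic_cas_body pattern: install set if *a == cmp,
         otherwise report the current value back through cmp */
      sav= __sync_val_compare_and_swap(&a, cmp, set);
      if (!(ret= (sav == cmp)))
        cmp= sav;

      printf("a=%d v=%d ret=%d cmp=%d\n", a, v, ret, cmp); /* a=42 v=15 ret=1 cmp=10 */
      return 0;
    }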
diff --git a/include/atomic/generic-msvc.h b/include/atomic/generic-msvc.h
new file mode 100644
index 00000000000..a84cde6b2c3
--- /dev/null
+++ b/include/atomic/generic-msvc.h
@@ -0,0 +1,134 @@
+/* Copyright (C) 2006-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#ifndef _atomic_h_cleanup_
+#define _atomic_h_cleanup_ "atomic/generic-msvc.h"
+
+/*
+  We don't implement anything specific for MY_ATOMIC_MODE_DUMMY, always use
+  intrinsics.
+  8 and 16-bit atomics are not implemented, but it can be done if necessary.
+*/
+#undef MY_ATOMIC_HAS_8_16
+
+#include <windows.h>
+/*
+  x86 compilers (both VS2003 and VS2005) never use intrinsics, but generate
+  function calls to kernel32 instead, even in the optimized build.
+  We force intrinsics as described in MSDN documentation for
+  _InterlockedCompareExchange.
+*/
+#ifdef _M_IX86
+
+#if (_MSC_VER >= 1500)
+#include <intrin.h>
+#else
+C_MODE_START
+/* Visual Studio 2003 and earlier do not have prototypes for atomic intrinsics */
+LONG _InterlockedCompareExchange (LONG volatile *Target, LONG Value, LONG Comp);
+LONGLONG _InterlockedCompareExchange64 (LONGLONG volatile *Target,
+                                        LONGLONG Value, LONGLONG Comp);
+C_MODE_END
+
+#pragma intrinsic(_InterlockedCompareExchange)
+#pragma intrinsic(_InterlockedCompareExchange64)
+#endif
+
+#define InterlockedCompareExchange _InterlockedCompareExchange
+#define InterlockedCompareExchange64 _InterlockedCompareExchange64
+/*
+  No need to do anything special for InterlockedCompareExchangePointer,
+  as it is a #define to InterlockedCompareExchange. The same applies to
+  InterlockedExchangePointer.
+*/
+#endif /*_M_IX86*/
+
+#define MY_ATOMIC_MODE "msvc-intrinsics"
+/* Implement using CAS on WIN32 */
+#define IL_COMP_EXCHG32(X,Y,Z) \
+  InterlockedCompareExchange((volatile LONG *)(X),(Y),(Z))
+#define IL_COMP_EXCHG64(X,Y,Z) \
+  InterlockedCompareExchange64((volatile LONGLONG *)(X), \
+                               (LONGLONG)(Y),(LONGLONG)(Z))
+#define IL_COMP_EXCHGptr InterlockedCompareExchangePointer
+
+#define make_atomic_cas_body(S) \
+  int ## S initial_cmp= *cmp; \
+  int ## S initial_a= IL_COMP_EXCHG ## S (a, set, initial_cmp); \
+  if (!(ret= (initial_a == initial_cmp))) *cmp= initial_a;
+
+#ifndef _M_IX86
+/* Use full set of optimised functions on WIN64 */
+#define IL_EXCHG_ADD32(X,Y) \
+  InterlockedExchangeAdd((volatile LONG *)(X),(Y))
+#define IL_EXCHG_ADD64(X,Y) \
+  InterlockedExchangeAdd64((volatile LONGLONG *)(X),(LONGLONG)(Y))
+#define IL_EXCHG32(X,Y) \
+  InterlockedExchange((volatile LONG *)(X),(Y))
+#define IL_EXCHG64(X,Y) \
+  InterlockedExchange64((volatile LONGLONG *)(X),(LONGLONG)(Y))
+#define IL_EXCHGptr InterlockedExchangePointer
+
+#define make_atomic_add_body(S) \
+  v= IL_EXCHG_ADD ## S (a, v)
+#define make_atomic_swap_body(S) \
+  v= IL_EXCHG ## S (a, v)
+#define make_atomic_load_body(S) \
+  ret= 0; /* avoid compiler warning */ \
+  ret= IL_COMP_EXCHG ## S (a, ret, ret);
+#endif
+/*
+  my_yield_processor (equivalent of x86 PAUSE instruction) should be used
+  to improve performance on hyperthreaded CPUs. Intel recommends using it in
+  spin loops also on non-HT machines to reduce power consumption (see e.g.
+  http://softwarecommunity.intel.com/articles/eng/2004.htm)
+
+  Running benchmarks for spinlocks implemented with InterlockedCompareExchange
+  and YieldProcessor shows that much better performance is achieved by calling
+  YieldProcessor in a loop - that is, yielding longer. On Intel boxes setting
+  loop count in the range 200-300 brought best results.
+*/
+#ifndef YIELD_LOOPS
+#define YIELD_LOOPS 200
+#endif
+
+static __inline int my_yield_processor()
+{
+  int i;
+  for (i= 0; i < YIELD_LOOPS; i++)
+  {
+#if (_MSC_VER <= 1310)
+    /* On older compilers YieldProcessor is not available, use inline assembly */
+    __asm { rep nop }
+#else
+    YieldProcessor();
+#endif
+  }
+  return 1;
+}
+
+#define LF_BACKOFF my_yield_processor()
+#else /* cleanup */
+
+#undef IL_EXCHG_ADD32
+#undef IL_EXCHG_ADD64
+#undef IL_COMP_EXCHG32
+#undef IL_COMP_EXCHG64
+#undef IL_COMP_EXCHGptr
+#undef IL_EXCHG32
+#undef IL_EXCHG64
+#undef IL_EXCHGptr
+
+#endif
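The LF_BACKOFF hook defined above is what lock-free spin loops call when they lose a race. A hypothetical sketch of the intended usage pattern (the spinlock and its names are illustrative, not from the tree), built on the same Win32 interlocked primitives and the yield-in-a-loop backoff the comment describes:

    #include <windows.h>

    static volatile LONG lock_word= 0;

    static void spin_lock(void)
    {
      /* Try to move lock_word 0 -> 1; on contention, back off by
         yielding the pipeline for a while and retry. */
      while (InterlockedCompareExchange(&lock_word, 1, 0) != 0)
      {
        int i;
        for (i= 0; i < 200; i++)   /* YIELD_LOOPS-style busy backoff */
          YieldProcessor();
      }
    }

    static void spin_unlock(void)
    {
      InterlockedExchange(&lock_word, 0);  /* full-barrier store of 0 */
    }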
diff --git a/include/atomic/nolock.h b/include/atomic/nolock.h
index d7d87167ade..5a0c41d9078 100644
--- a/include/atomic/nolock.h
+++ b/include/atomic/nolock.h
@@ -16,43 +16,46 @@
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
 
-#if defined(__i386__) || defined(_M_IX86) || defined(HAVE_GCC_ATOMIC_BUILTINS)
+#if defined(__i386__) || defined(_MSC_VER) || defined(__x86_64__) \
+    || defined(HAVE_GCC_ATOMIC_BUILTINS) \
+    || defined(HAVE_SOLARIS_ATOMIC)
 
-#ifdef MY_ATOMIC_MODE_DUMMY
-# define LOCK ""
-#else
-# define LOCK "lock"
-#endif
-
-#ifdef HAVE_GCC_ATOMIC_BUILTINS
-#include "gcc_builtins.h"
-#elif __GNUC__
-#include "x86-gcc.h"
-#elif defined(_MSC_VER)
-#include "x86-msvc.h"
-#endif
-
-#elif defined(HAVE_SOLARIS_ATOMIC)
-
-#include "solaris.h"
-
-#endif /* __i386__ || _M_IX86 || HAVE_GCC_ATOMIC_BUILTINS */
-
-#if defined(make_atomic_cas_body) || defined(MY_ATOMICS_MADE)
+# ifdef MY_ATOMIC_MODE_DUMMY
+#  define LOCK_prefix ""
+# else
+#  define LOCK_prefix "lock"
+# endif
 
 /*
- * We have atomics that require no locking
- */
-#define MY_ATOMIC_NOLOCK
-
-#ifdef __SUNPRO_C
-/*
- * Sun Studio 12 (and likely earlier) does not accept a typedef struct {}
- */
-typedef char my_atomic_rwlock_t;
-#else
-typedef struct { } my_atomic_rwlock_t;
+  We choose the implementation as follows:
+  ----------------------------------------
+  On Windows with Visual C++ the native implementation is preferable.
+  With gcc we prefer the native x86 implementation; we prefer the
+  Solaris implementation over the gcc builtins for stability reasons,
+  and fall back to the gcc builtins if nothing else works. If neither
+  Visual C++ nor gcc is used, we still choose the Solaris
+  implementation on Solaris (mainly for SunStudio compiles).
+*/
+# if defined(_MSC_VER)
+#  include "generic-msvc.h"
+# elif __GNUC__
+#  if defined(__i386__) || defined(__x86_64__)
+#   include "x86-gcc.h"
+#  elif defined(HAVE_SOLARIS_ATOMIC)
+#   include "solaris.h"
+#  elif defined(HAVE_GCC_ATOMIC_BUILTINS)
+#   include "gcc_builtins.h"
+#  endif
+# elif defined(HAVE_SOLARIS_ATOMIC)
+#  include "solaris.h"
+# endif
 #endif
 
+#if defined(make_atomic_cas_body)
+/*
+  Type not used so minimal size (empty struct has different size between C
+  and C++, zero-length array is gcc-specific).
+*/
+typedef char my_atomic_rwlock_t __attribute__ ((unused));
 #define my_atomic_rwlock_destroy(name)
 #define my_atomic_rwlock_init(name)
 #define my_atomic_rwlock_rdlock(name)
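The net effect of this header, once a native implementation has defined make_atomic_cas_body, is that my_atomic_rwlock_t degenerates to an unused one-byte type and every lock macro expands to nothing. Callers still bracket atomic operations with the lock macros, so the same code also compiles in rwlock mode. A sketch of that caller-side contract, assuming the usual my_atomic.h API and the tree's normal includes (error handling omitted):

    #include "my_atomic.h"   /* assumes my_global.h et al. are included first */

    static my_atomic_rwlock_t counter_lock;  /* a plain char here: no-ops */
    static int32 counter= 0;

    void counter_increment(void)
    {
      my_atomic_rwlock_wrlock(&counter_lock);   /* compiles away */
      my_atomic_add32(&counter, 1);             /* native atomic add */
      my_atomic_rwlock_wrunlock(&counter_lock); /* compiles away */
    }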
diff --git a/include/atomic/rwlock.h b/include/atomic/rwlock.h
index 0ff4d16c545..a31f8ed6ca1 100644
--- a/include/atomic/rwlock.h
+++ b/include/atomic/rwlock.h
@@ -1,7 +1,7 @@
 #ifndef ATOMIC_RWLOCK_INCLUDED
 #define ATOMIC_RWLOCK_INCLUDED
 
-/* Copyright (C) 2006 MySQL AB
+/* Copyright (C) 2006 MySQL AB, 2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -16,7 +16,7 @@
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
 
-typedef struct {pthread_rwlock_t rw;} my_atomic_rwlock_t;
+#define MY_ATOMIC_MODE_RWLOCKS 1
 
 #ifdef MY_ATOMIC_MODE_DUMMY
 /*
@@ -26,6 +26,9 @@
   implementations (another way is to run a UP build on an SMP box).
 */
 #warning MY_ATOMIC_MODE_DUMMY and MY_ATOMIC_MODE_RWLOCKS are incompatible
+
+typedef char my_atomic_rwlock_t;
+
 #define my_atomic_rwlock_destroy(name)
 #define my_atomic_rwlock_init(name)
 #define my_atomic_rwlock_rdlock(name)
@@ -33,18 +36,63 @@
 #define my_atomic_rwlock_rdunlock(name)
 #define my_atomic_rwlock_wrunlock(name)
 #define MY_ATOMIC_MODE "dummy (non-atomic)"
-#else
-#define my_atomic_rwlock_destroy(name)     pthread_rwlock_destroy(& (name)->rw)
-#define my_atomic_rwlock_init(name)        pthread_rwlock_init(& (name)->rw, 0)
-#define my_atomic_rwlock_rdlock(name)      pthread_rwlock_rdlock(& (name)->rw)
-#define my_atomic_rwlock_wrlock(name)      pthread_rwlock_wrlock(& (name)->rw)
-#define my_atomic_rwlock_rdunlock(name)    pthread_rwlock_unlock(& (name)->rw)
-#define my_atomic_rwlock_wrunlock(name)    pthread_rwlock_unlock(& (name)->rw)
-#define MY_ATOMIC_MODE "rwlocks"
+#else /* not MY_ATOMIC_MODE_DUMMY */
+
+typedef struct {pthread_mutex_t rw;} my_atomic_rwlock_t;
+
+#ifndef SAFE_MUTEX
+
+/*
+  We use read-write lock macros but map them to mutex locks, which are
+  faster. Still, with a semantically rich API we can change the
+  underlying implementation, if necessary.
+*/
+#define my_atomic_rwlock_destroy(name)     pthread_mutex_destroy(& (name)->rw)
+#define my_atomic_rwlock_init(name)        pthread_mutex_init(& (name)->rw, 0)
+#define my_atomic_rwlock_rdlock(name)      pthread_mutex_lock(& (name)->rw)
+#define my_atomic_rwlock_wrlock(name)      pthread_mutex_lock(& (name)->rw)
+#define my_atomic_rwlock_rdunlock(name)    pthread_mutex_unlock(& (name)->rw)
+#define my_atomic_rwlock_wrunlock(name)    pthread_mutex_unlock(& (name)->rw)
+
+#else /* SAFE_MUTEX */
+
+/*
+  SAFE_MUTEX pollutes the compiling name space with macros
+  that alter pthread_mutex_t, pthread_mutex_init, etc.
+  Atomic operations should never use the safe mutex wrappers.
+  Unfortunately, there is no way to have both:
+  - safe mutex macros expanding pthread_mutex_lock to safe_mutex_lock
+  - my_atomic macros expanding to unmodified pthread_mutex_lock
+  inlined in the same compilation unit.
+  So, in case of SAFE_MUTEX, a function call is required.
+  Given that SAFE_MUTEX is a debugging facility,
+  this extra function call is not a performance concern for
+  production builds.
+*/
+C_MODE_START
+extern void plain_pthread_mutex_init(safe_mutex_t *);
+extern void plain_pthread_mutex_destroy(safe_mutex_t *);
+extern void plain_pthread_mutex_lock(safe_mutex_t *);
+extern void plain_pthread_mutex_unlock(safe_mutex_t *);
+C_MODE_END
+
+#define my_atomic_rwlock_destroy(name)     plain_pthread_mutex_destroy(&(name)->rw)
+#define my_atomic_rwlock_init(name)        plain_pthread_mutex_init(&(name)->rw)
+#define my_atomic_rwlock_rdlock(name)      plain_pthread_mutex_lock(&(name)->rw)
+#define my_atomic_rwlock_wrlock(name)      plain_pthread_mutex_lock(&(name)->rw)
+#define my_atomic_rwlock_rdunlock(name)    plain_pthread_mutex_unlock(&(name)->rw)
+#define my_atomic_rwlock_wrunlock(name)    plain_pthread_mutex_unlock(&(name)->rw)
+
+#endif /* SAFE_MUTEX */
+
+#define MY_ATOMIC_MODE "mutex"
+#ifndef MY_ATOMIC_MODE_RWLOCKS
+#define MY_ATOMIC_MODE_RWLOCKS 1
+#endif
 #endif
 
 #define make_atomic_add_body(S)     int ## S sav; sav= *a; *a+= v; v=sav;
-#define make_atomic_swap_body(S)    int ## S sav; sav= *a; *a= v; v=sav;
+#define make_atomic_fas_body(S)     int ## S sav; sav= *a; *a= v; v=sav;
#define make_atomic_cas_body(S)     if ((ret= (*a == *cmp))) *a= set; else *cmp=*a;
 #define make_atomic_load_body(S)    ret= *a;
 #define make_atomic_store_body(S)   *a= v;
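The plain_pthread_mutex_* functions are only declared here; their definitions live elsewhere in mysys, in a translation unit compiled with the SAFE_MUTEX macro wrappers switched off. A hypothetical sketch of what such definitions look like, assuming safe_mutex_t keeps the raw lock in a member named mutex (both the file placement and the member name are assumptions, not shown by the patch):

    /* compiled in a unit where the SAFE_MUTEX wrappers are disabled,
       so these calls reach the raw pthread functions */
    #undef pthread_mutex_init
    #undef pthread_mutex_destroy
    #undef pthread_mutex_lock
    #undef pthread_mutex_unlock

    void plain_pthread_mutex_init(safe_mutex_t *m)
    { pthread_mutex_init(&m->mutex, NULL); }

    void plain_pthread_mutex_destroy(safe_mutex_t *m)
    { pthread_mutex_destroy(&m->mutex); }

    void plain_pthread_mutex_lock(safe_mutex_t *m)
    { pthread_mutex_lock(&m->mutex); }

    void plain_pthread_mutex_unlock(safe_mutex_t *m)
    { pthread_mutex_unlock(&m->mutex); }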
diff --git a/include/atomic/solaris.h b/include/atomic/solaris.h
index 4c51253d2d5..fc9f369c707 100644
--- a/include/atomic/solaris.h
+++ b/include/atomic/solaris.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2008 MySQL AB
+/* Copyright (C) 2008 MySQL AB, 2009 Sun Microsystems, Inc
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -13,198 +13,54 @@
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
 
+#ifndef _atomic_h_cleanup_
+#define _atomic_h_cleanup_ "atomic/solaris.h"
+
 #include <atomic.h>
 
 #define MY_ATOMIC_MODE "solaris-atomic"
 
-/*
- * This is defined to indicate we fully define the my_atomic_* (inline)
- * functions here, so there is no need to "make" them in my_atomic.h
- * using make_atomic_* and make_atomic_*_body.
- */
-#define MY_ATOMICS_MADE
-
-STATIC_INLINE int
-my_atomic_cas8(int8 volatile *a, int8 *cmp, int8 set)
-{
-  int ret;
-  int8 sav;
-  sav = (int8) atomic_cas_8((volatile uint8_t *)a, (uint8_t)*cmp,
-                            (uint8_t)set);
-  if (! (ret = (sav == *cmp)))
-    *cmp = sav;
-  return ret;
-}
-
-STATIC_INLINE int
-my_atomic_cas16(int16 volatile *a, int16 *cmp, int16 set)
-{
-  int ret;
-  int16 sav;
-  sav = (int16) atomic_cas_16((volatile uint16_t *)a, (uint16_t)*cmp,
-                              (uint16_t)set);
-  if (! (ret = (sav == *cmp)))
-    *cmp = sav;
-  return ret;
-}
-
-STATIC_INLINE int
-my_atomic_cas32(int32 volatile *a, int32 *cmp, int32 set)
-{
-  int ret;
-  int32 sav;
-  sav = (int32) atomic_cas_32((volatile uint32_t *)a, (uint32_t)*cmp,
-                              (uint32_t)set);
-  if (! (ret = (sav == *cmp)))
-    *cmp = sav;
-  return ret;
-}
-
-STATIC_INLINE int
-my_atomic_casptr(void * volatile *a, void **cmp, void *set)
-{
-  int ret;
-  void *sav;
-  sav = atomic_cas_ptr(a, *cmp, set);
-  if (! (ret = (sav == *cmp)))
-    *cmp = sav;
-  return ret;
-}
+#define uintptr_t void *
+#define atomic_or_ptr_nv(X,Y) (void *)atomic_or_ulong_nv((volatile ulong_t *)X, Y)
 
-/* ------------------------------------------------------------------------ */
+#define make_atomic_cas_body(S) \
+  uint ## S ## _t sav; \
+  sav = atomic_cas_ ## S( \
+          (volatile uint ## S ## _t *)a, \
+          (uint ## S ## _t)*cmp, \
+          (uint ## S ## _t)set); \
+  if (! (ret= (sav == *cmp))) \
+    *cmp= sav;
 
-STATIC_INLINE int8
-my_atomic_add8(int8 volatile *a, int8 v)
-{
-  int8 nv;
-  nv = atomic_add_8_nv((volatile uint8_t *)a, v);
-  return (nv - v);
-}
-
-STATIC_INLINE int16
-my_atomic_add16(int16 volatile *a, int16 v)
-{
-  int16 nv;
-  nv = atomic_add_16_nv((volatile uint16_t *)a, v);
-  return (nv - v);
-}
-
-STATIC_INLINE int32
-my_atomic_add32(int32 volatile *a, int32 v)
-{
-  int32 nv;
-  nv = atomic_add_32_nv((volatile uint32_t *)a, v);
-  return (nv - v);
-}
+#define make_atomic_add_body(S) \
+  int ## S nv;  /* new value */ \
+  nv= atomic_add_ ## S ## _nv((volatile uint ## S ## _t *)a, v); \
+  v= nv - v
 
 /* ------------------------------------------------------------------------ */
 
 #ifdef MY_ATOMIC_MODE_DUMMY
 
-STATIC_INLINE int8
-my_atomic_load8(int8 volatile *a) { return (*a); }
-
-STATIC_INLINE int16
-my_atomic_load16(int16 volatile *a) { return (*a); }
-
-STATIC_INLINE int32
-my_atomic_load32(int32 volatile *a) { return (*a); }
-
-STATIC_INLINE void *
-my_atomic_loadptr(void * volatile *a) { return (*a); }
-
-/* ------------------------------------------------------------------------ */
-
-STATIC_INLINE void
-my_atomic_store8(int8 volatile *a, int8 v) { *a = v; }
+#define make_atomic_load_body(S)   ret= *a
+#define make_atomic_store_body(S)  *a= v
 
-STATIC_INLINE void
-my_atomic_store16(int16 volatile *a, int16 v) { *a = v; }
-
-STATIC_INLINE void
-my_atomic_store32(int32 volatile *a, int32 v) { *a = v; }
+#else /* MY_ATOMIC_MODE_DUMMY */
 
-STATIC_INLINE void
-my_atomic_storeptr(void * volatile *a, void *v) { *a = v; }
+#define make_atomic_load_body(S) \
+  ret= atomic_or_ ## S ## _nv((volatile uint ## S ## _t *)a, 0)
 
-/* ------------------------------------------------------------------------ */
+#define make_atomic_store_body(S) \
+  (void) atomic_swap_ ## S((volatile uint ## S ## _t *)a, (uint ## S ## _t)v)
 
-#else /* MY_ATOMIC_MODE_DUMMY */
+#endif
 
-STATIC_INLINE int8
-my_atomic_load8(int8 volatile *a)
-{
-  return ((int8) atomic_or_8_nv((volatile uint8_t *)a, 0));
-}
-
-STATIC_INLINE int16
-my_atomic_load16(int16 volatile *a)
-{
-  return ((int16) atomic_or_16_nv((volatile uint16_t *)a, 0));
-}
-
-STATIC_INLINE int32
-my_atomic_load32(int32 volatile *a)
-{
-  return ((int32) atomic_or_32_nv((volatile uint32_t *)a, 0));
-}
-
-STATIC_INLINE void *
-my_atomic_loadptr(void * volatile *a)
-{
-  return ((void *) atomic_or_ulong_nv((volatile ulong_t *)a, 0));
-}
+#define make_atomic_fas_body(S) \
+  v= atomic_swap_ ## S((volatile uint ## S ## _t *)a, (uint ## S ## _t)v)
 
-/* ------------------------------------------------------------------------ */
+#else /* cleanup */
 
-STATIC_INLINE void
-my_atomic_store8(int8 volatile *a, int8 v)
-{
-  (void) atomic_swap_8((volatile uint8_t *)a, (uint8_t)v);
-}
-
-STATIC_INLINE void
-my_atomic_store16(int16 volatile *a, int16 v)
-{
-  (void) atomic_swap_16((volatile uint16_t *)a, (uint16_t)v);
-}
-
-STATIC_INLINE void
-my_atomic_store32(int32 volatile *a, int32 v)
-{
-  (void) atomic_swap_32((volatile uint32_t *)a, (uint32_t)v);
-}
-
-STATIC_INLINE void
-my_atomic_storeptr(void * volatile *a, void *v)
-{
-  (void) atomic_swap_ptr(a, v);
-}
+#undef uintptr_t
+#undef atomic_or_ptr_nv
 
 #endif
 
-/* ------------------------------------------------------------------------ */
-
-STATIC_INLINE int8
-my_atomic_swap8(int8 volatile *a, int8 v)
-{
-  return ((int8) atomic_swap_8((volatile uint8_t *)a, (uint8_t)v));
-}
-
-STATIC_INLINE int16
-my_atomic_swap16(int16 volatile *a, int16 v)
-{
-  return ((int16) atomic_swap_16((volatile uint16_t *)a, (uint16_t)v));
-}
-
-STATIC_INLINE int32
-my_atomic_swap32(int32 volatile *a, int32 v)
-{
-  return ((int32) atomic_swap_32((volatile uint32_t *)a, (uint32_t)v));
-}
-
-STATIC_INLINE void *
-my_atomic_swapptr(void * volatile *a, void *v)
-{
-  return (atomic_swap_ptr(a, v));
-}
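To see what the generated code amounts to after this change, here is a hand expansion of make_atomic_cas_body(32) wrapped in a function (illustrative only; the real function is produced by the make_atomic_* machinery in my_atomic.h, and the int32 typedef stands in for the tree's own):

    #include <atomic.h>    /* Solaris atomic_cas_32() */
    #include <inttypes.h>

    typedef int32_t int32;  /* stand-in for the tree's typedef */

    static inline int my_atomic_cas32_expanded(int32 volatile *a,
                                               int32 *cmp, int32 set)
    {
      int ret;
      uint32_t sav;
      /* atomic_cas_32 returns the value *a held before the operation */
      sav= atomic_cas_32((volatile uint32_t *)a, (uint32_t)*cmp,
                         (uint32_t)set);
      if (!(ret= (sav == (uint32_t)*cmp)))
        *cmp= (int32)sav;   /* failed: report the current value back */
      return ret;
    }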
diff --git a/include/atomic/x86-gcc.h b/include/atomic/x86-gcc.h
index c3029f9c1b4..61b94a48568 100644
--- a/include/atomic/x86-gcc.h
+++ b/include/atomic/x86-gcc.h
@@ -22,10 +22,24 @@
   architectures support double-word (128-bit) cas.
 */
 
-#ifdef MY_ATOMIC_NO_XADD
-#define MY_ATOMIC_MODE "gcc-x86" LOCK "-no-xadd"
+/*
+  No special support for 8- and 16-bit operations is implemented here
+  currently.
+*/
+#undef MY_ATOMIC_HAS_8_AND_16
+
+#ifdef __x86_64__
+# ifdef MY_ATOMIC_NO_XADD
+#  define MY_ATOMIC_MODE "gcc-amd64" LOCK_prefix "-no-xadd"
+# else
+#  define MY_ATOMIC_MODE "gcc-amd64" LOCK_prefix
+# endif
 #else
-#define MY_ATOMIC_MODE "gcc-x86" LOCK
+# ifdef MY_ATOMIC_NO_XADD
+#  define MY_ATOMIC_MODE "gcc-x86" LOCK_prefix "-no-xadd"
+# else
+#  define MY_ATOMIC_MODE "gcc-x86" LOCK_prefix
+# endif
 #endif
 
 /* fix -ansi errors while maintaining readability */
@@ -34,29 +48,79 @@
 #endif
 
 #ifndef MY_ATOMIC_NO_XADD
-#define make_atomic_add_body(S) \
-  asm volatile (LOCK "; xadd %0, %1;" : "+r" (v) , "+m" (*a))
+#define make_atomic_add_body(S)   make_atomic_add_body ## S
+#define make_atomic_cas_body(S)   make_atomic_cas_body ## S
 #endif
-#define make_atomic_swap_body(S) \
-  asm volatile ("; xchg %0, %1;" : "+q" (v) , "+m" (*a))
-#define make_atomic_cas_body(S) \
-  asm volatile (LOCK "; cmpxchg %3, %0; setz %2;" \
+
+#define make_atomic_add_body32 \
+  asm volatile (LOCK_prefix "; xadd %0, %1;" : "+r" (v) , "+m" (*a))
+
+#define make_atomic_cas_body32 \
+  asm volatile (LOCK_prefix "; cmpxchg %3, %0; setz %2;" \
        : "+m" (*a), "+a" (*cmp), "=q" (ret): "r" (set))
-#ifdef MY_ATOMIC_MODE_DUMMY
-#define make_atomic_load_body(S)   ret=*a
-#define make_atomic_store_body(S)  *a=v
-#else
+
+#ifdef __x86_64__
+#define make_atomic_add_body64 make_atomic_add_body32
+#define make_atomic_cas_body64 make_atomic_cas_body32
+
+#define make_atomic_fas_body(S) \
+  asm volatile ("xchg %0, %1;" : "+r" (v) , "+m" (*a))
+
 /*
   Actually 32-bit reads/writes are always atomic on x86
-  But we add LOCK here anyway to force memory barriers
+  But we add LOCK_prefix here anyway to force memory barriers
+*/
+#define make_atomic_load_body(S) \
+  ret=0; \
+  asm volatile (LOCK_prefix "; cmpxchg %2, %0" \
+       : "+m" (*a), "+a" (ret): "r" (ret))
+#define make_atomic_store_body(S) \
+  asm volatile ("; xchg %0, %1;" : "+m" (*a), "+r" (v))
+
+#else
+/*
+  Use default implementations of 64-bit operations: since we solved
+  the 64-bit problem for CAS on 32-bit platforms, there is no need to
+  solve it once more for ADD, LOAD, STORE and FAS as well.
+  Since we already added add32 support, we need to define add64
+  here, but we haven't defined fas, load and store at all, so
+  we can fall back on the default implementations.
 */
-#define make_atomic_load_body(S) \
-  ret=0; \
-  asm volatile (LOCK "; cmpxchg %2, %0" \
-       : "+m" (*a), "+a" (ret): "r" (ret))
-#define make_atomic_store_body(S) \
-  asm volatile ("; xchg %0, %1;" : "+m" (*a) : "r" (v))
+#define make_atomic_add_body64 \
+  int64 tmp=*a; \
+  while (!my_atomic_cas64(a, &tmp, tmp+v)) ; \
+  v=tmp;
+
+/*
+  On some platforms (e.g. Mac OS X and Solaris) the ebx register
+  is held as a pointer to the global offset table. Thus we're not
+  allowed to use the b-register on those platforms when compiling
+  PIC code; to avoid this we push and pop ebx and add a movl
+  instruction so that ebx does not appear in the interface of the
+  assembler instruction.
+
+  cmpxchg8b works on both 32-bit and 64-bit platforms, but the code
+  here is only used on 32-bit platforms; on 64-bit platforms the much
+  simpler make_atomic_cas_body32 will work fine.
+*/
+#define make_atomic_cas_body64 \
+  int32 ebx=(set & 0xFFFFFFFF), ecx=(set >> 32); \
+  asm volatile ("push %%ebx; movl %3, %%ebx;" \
+                LOCK_prefix "; cmpxchg8b %0; setz %2; pop %%ebx" \
+                : "+m" (*a), "+A" (*cmp), "=c" (ret) \
+                : "m" (ebx), "c" (ecx))
 #endif
 
+/*
+  make_atomic_cas_body32 adapts to the OS word size, so on 64-bit
+  platforms it automatically operates on 64-bit values; it can
+  therefore serve as the pointer CAS on both word sizes.
+*/
+#define make_atomic_cas_bodyptr make_atomic_cas_body32
+
+#ifdef MY_ATOMIC_MODE_DUMMY
+#define make_atomic_load_body(S)   ret=*a
+#define make_atomic_store_body(S)  *a=v
+#endif
 #endif /* ATOMIC_X86_GCC_INCLUDED */
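The make_atomic_add_body64 fallback above is the classic CAS-loop shape for fetch-and-add. Written out with GCC builtins for illustration (the patch itself loops through my_atomic_cas64, which updates its cmp argument on failure, so its loop body can stay empty; the function name here is illustrative):

    #include <stdint.h>

    /* sketch; on 32-bit x86 gcc lowers the 8-byte builtin to
       lock cmpxchg8b, the same instruction the macro above emits */
    static int64_t fetch_and_add64(volatile int64_t *a, int64_t v)
    {
      int64_t tmp= *a;               /* optimistic read of the old value */
      while (!__sync_bool_compare_and_swap(a, tmp, tmp + v))
        tmp= *a;                     /* lost the race: re-read and retry */
      return tmp;                    /* old value, as xadd would return */
    }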
diff --git a/include/atomic/x86-msvc.h b/include/atomic/x86-msvc.h
deleted file mode 100644
index c4885bb8451..00000000000
--- a/include/atomic/x86-msvc.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/* Copyright (C) 2006 MySQL AB
-
-   This program is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published by
-   the Free Software Foundation; version 2 of the License.
-
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
-
-/*
-  XXX 64-bit atomic operations can be implemented using
-  cmpxchg8b, if necessary
-*/
-
-// Would it be better to use intrinsics ?
-// (InterlockedCompareExchange, InterlockedCompareExchange16
-// InterlockedExchangeAdd, InterlockedExchange)
-
-#ifndef _atomic_h_cleanup_
-#define _atomic_h_cleanup_ "atomic/x86-msvc.h"
-
-#define MY_ATOMIC_MODE "msvc-x86" LOCK
-
-#define make_atomic_add_body(S) \
-  _asm { \
-    _asm mov   reg_ ## S, v \
-    _asm LOCK  xadd *a, reg_ ## S \
-    _asm movzx v, reg_ ## S \
-  }
-#define make_atomic_cas_body(S) \
-  _asm { \
-    _asm mov    areg_ ## S, *cmp \
-    _asm mov    reg2_ ## S, set \
-    _asm LOCK cmpxchg *a, reg2_ ## S \
-    _asm mov    *cmp, areg_ ## S \
-    _asm setz   al \
-    _asm movzx  ret, al \
-  }
-#define make_atomic_swap_body(S) \
-  _asm { \
-    _asm mov    reg_ ## S, v \
-    _asm xchg   *a, reg_ ## S \
-    _asm mov    v, reg_ ## S \
-  }
-
-#ifdef MY_ATOMIC_MODE_DUMMY
-#define make_atomic_load_body(S)   ret=*a
-#define make_atomic_store_body(S)  *a=v
-#else
-/*
-  Actually 32-bit reads/writes are always atomic on x86
-  But we add LOCK here anyway to force memory barriers
-*/
-#define make_atomic_load_body(S) \
-  _asm { \
-    _asm mov areg_ ## S, 0 \
-    _asm mov reg2_ ## S, areg_ ## S \
-    _asm LOCK cmpxchg *a, reg2_ ## S \
-    _asm mov ret, areg_ ## S \
-  }
-#define make_atomic_store_body(S) \
-  _asm { \
-    _asm mov    reg_ ## S, v \
-    _asm xchg   *a, reg_ ## S \
-  }
-#endif
-
-#define reg_8    al
-#define reg_16   ax
-#define reg_32   eax
-#define areg_8   al
-#define areg_16  ax
-#define areg_32  eax
-#define reg2_8   bl
-#define reg2_16  bx
-#define reg2_32  ebx
-
-#else /* cleanup */
-
-#undef reg_8
-#undef reg_16
-#undef reg_32
-#undef areg_8
-#undef areg_16
-#undef areg_32
-#undef reg2_8
-#undef reg2_16
-#undef reg2_32
-
-#endif