35 files changed, 2551 insertions, 404 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6e53ff30159..7a5ee9223bc 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (C) 2006 MySQL AB
+# Copyright (C) 2006 MySQL AB, 2009 Sun Microsystems, Inc
 # 
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -284,10 +284,16 @@ CONFIGURE_FILE(${CMAKE_SOURCE_DIR}/sql/sql_builtin.cc.in
 SET (ENGINE_BUILD_TYPE "STATIC")
 FOREACH(DIR ${STATIC_ENGINE_DIRECTORIES})
   ADD_SUBDIRECTORY(${DIR})
+  IF(EXISTS ${DIR}/unittest)
+        ADD_SUBDIRECTORY(${DIR}/unittest)
+  ENDIF(EXISTS ${DIR}/unittest)
 ENDFOREACH(DIR ${STATIC_ENGINE_DIRECTORIES})
 
 SET (ENGINE_BUILD_TYPE "DYNAMIC")
 FOREACH(DIR ${DYNAMIC_ENGINE_DIRECTORIES})
   ADD_SUBDIRECTORY(${DIR})
+  IF(EXISTS ${DIR}/unittest)  # engine first, then its tests (same order as the STATIC loop)
+    ADD_SUBDIRECTORY(${DIR}/unittest)
+  ENDIF(EXISTS ${DIR}/unittest)
 ENDFOREACH(DIR ${DYNAMIC_ENGINE_DIRECTORIES})
 
@@ -313,6 +319,9 @@ ADD_SUBDIRECTORY(sql)
 ADD_SUBDIRECTORY(libmysql)
 ADD_SUBDIRECTORY(libservices)
 ADD_SUBDIRECTORY(tests)
+ADD_SUBDIRECTORY(unittest/mytap)
+ADD_SUBDIRECTORY(unittest/examples)
+ADD_SUBDIRECTORY(unittest/mysys)
 IF(WITH_EMBEDDED_SERVER) 
   ADD_SUBDIRECTORY(libmysqld)
   ADD_SUBDIRECTORY(libmysqld/examples)
diff --git a/configure.in b/configure.in
index aeebb2c5796..361d117b2dc 100644
--- a/configure.in
+++ b/configure.in
@@ -1750,64 +1750,74 @@ then
 fi
 
 AC_ARG_WITH([atomic-ops],
-	    AC_HELP_STRING([--with-atomic-ops=rwlocks|smp|up],
-	    [Implement atomic operations using pthread rwlocks or atomic CPU
-             instructions for multi-processor (default) or uniprocessor
-             configuration]), , [with_atomic_ops=smp])
+            AS_HELP_STRING([--with-atomic-ops=rwlocks|smp|up],
+            [Implement atomic operations using pthread rwlocks or atomic CPU
+             instructions for multi-processor or uniprocessor
+             configuration. By default gcc built-in sync functions are used,
+             if available and 'smp' configuration otherwise.]))
 case "$with_atomic_ops" in
   "up") AC_DEFINE([MY_ATOMIC_MODE_DUMMY], [1],
                   [Assume single-CPU mode, no concurrency]) ;;
   "rwlocks") AC_DEFINE([MY_ATOMIC_MODE_RWLOCKS], [1],
                   [Use pthread rwlocks for atomic ops]) ;;
   "smp") ;;
+  "")
+   ;;
    *) AC_MSG_ERROR(["$with_atomic_ops" is not a valid value for --with-atomic-ops]) ;;
 esac
 
 AC_CACHE_CHECK([whether the compiler provides atomic builtins],
-               [mysql_cv_gcc_atomic_builtins], [AC_TRY_RUN([
-  int main()
-  {
-    int foo= -10; int bar= 10;
-    if (!__sync_fetch_and_add(&foo, bar) || foo)
-      return -1;
-    bar= __sync_lock_test_and_set(&foo, bar);
-    if (bar || foo != 10)
-      return -1;
-    bar= __sync_val_compare_and_swap(&bar, foo, 15);
-    if (bar)
-      return -1;
-    return 0;
-  }
-], [mysql_cv_gcc_atomic_builtins=yes],
+               [mysql_cv_gcc_atomic_builtins],
+  [AC_RUN_IFELSE(
+     [AC_LANG_PROGRAM(
+        [
+        ],
+     [[
+        int foo= -10; int bar= 10;
+        if (!__sync_fetch_and_add(&foo, bar) || foo)
+          return -1;
+        bar= __sync_lock_test_and_set(&foo, bar);
+        if (bar || foo != 10)
+          return -1;
+        bar= __sync_val_compare_and_swap(&bar, foo, 15);
+        if (bar)
+          return -1;
+        return 0;
+     ]]
+     )],
+   [mysql_cv_gcc_atomic_builtins=yes],
    [mysql_cv_gcc_atomic_builtins=no],
-   [mysql_cv_gcc_atomic_builtins=no])])
-
+   [mysql_cv_gcc_atomic_builtins=no]
+)])
 if test "x$mysql_cv_gcc_atomic_builtins" = xyes; then
   AC_DEFINE(HAVE_GCC_ATOMIC_BUILTINS, 1,
             [Define to 1 if compiler provides atomic builtins.])
 fi
 
 AC_CACHE_CHECK([whether the OS provides atomic_* functions like Solaris],
-               [mysql_cv_solaris_atomic], [AC_TRY_RUN([
-#include <atomic.h>
-int
-main()
-{
-	int foo = -10; int bar = 10;
-	if (atomic_add_int_nv((uint_t *)&foo, bar) || foo)
-		return -1;
-	bar = atomic_swap_uint((uint_t *)&foo, (uint_t)bar);
-	if (bar || foo != 10)
-		return -1;
-	bar = atomic_cas_uint((uint_t *)&bar, (uint_t)foo, 15);
-	if (bar)
-		return -1;
-	return 0;
-}
-], [mysql_cv_solaris_atomic=yes],
+               [mysql_cv_solaris_atomic],
+  [AC_RUN_IFELSE(
+     [AC_LANG_PROGRAM(
+        [
+        #include <atomic.h>
+        ],
+     [[
+        int foo = -10; int bar = 10;
+        if (atomic_add_int_nv((uint_t *)&foo, bar) || foo)
+          return -1;
+        bar = atomic_swap_uint((uint_t *)&foo, (uint_t)bar);
+        if (bar || foo != 10)
+          return -1;
+        bar = atomic_cas_uint((uint_t *)&bar, (uint_t)foo, 15);
+        if (bar)
+          return -1;
+        return 0;
+     ]]
+     )],
+   [mysql_cv_solaris_atomic=yes],
    [mysql_cv_solaris_atomic=no],
-   [mysql_cv_solaris_atomic=no])])
-
+   [mysql_cv_solaris_atomic=no]
+)])
 if test "x$mysql_cv_solaris_atomic" = xyes; then
   AC_DEFINE(HAVE_SOLARIS_ATOMIC, 1,
             [Define to 1 if OS provides atomic_* functions like Solaris.])
@@ -2102,7 +2112,7 @@ AC_CHECK_FUNCS(alarm bcmp bfill bmove bsearch bzero \
   pthread_getsequence_np pthread_key_delete pthread_rwlock_rdlock pthread_sigmask \
   readlink realpath rename rint rwlock_init setupterm \
   shmget shmat shmdt shmctl sigaction sigemptyset sigaddset \
-  sighold sigset sigthreadmask port_create sleep \
+  sighold sigset sigthreadmask port_create sleep thr_yield \
   snprintf socket stpcpy strcasecmp strerror strsignal strnlen strpbrk strstr \
   strtol strtoll strtoul strtoull tell tempnam thr_setconcurrency vidattr \
   posix_fallocate backtrace backtrace_symbols backtrace_symbols_fd)
diff --git a/include/Makefile.am b/include/Makefile.am
index 0a7a35bd6fa..83a22f1beec 100644
--- a/include/Makefile.am
+++ b/include/Makefile.am
@@ -30,7 +30,7 @@ pkginclude_HEADERS =	$(HEADERS_ABI) my_dbug.h m_string.h my_sys.h \
 			m_ctype.h my_attribute.h $(HEADERS_GEN_CONFIGURE) \
 			$(HEADERS_GEN_MAKE) probes_mysql.h probes_mysql_nodtrace.h
 
-noinst_HEADERS =	config-win.h config-netware.h my_bit.h \
+noinst_HEADERS =	config-win.h config-netware.h lf.h my_bit.h \
 			heap.h my_bitmap.h my_uctype.h \
 			myisam.h myisampack.h myisammrg.h ft_global.h\
 			mysys_err.h my_base.h help_start.h help_end.h \
@@ -39,9 +39,9 @@ noinst_HEADERS =	config-win.h config-netware.h my_bit.h \
 			thr_lock.h t_ctype.h violite.h my_md5.h base64.h \
 			my_handler.h my_time.h service_versions.h \
 			my_vle.h my_user.h my_atomic.h atomic/nolock.h \
-			atomic/rwlock.h atomic/x86-gcc.h atomic/x86-msvc.h \
-                        atomic/solaris.h \
-			atomic/gcc_builtins.h my_libwrap.h my_stacktrace.h
+			atomic/rwlock.h atomic/x86-gcc.h atomic/generic-msvc.h \
+			atomic/gcc_builtins.h my_libwrap.h my_stacktrace.h \
+			atomic/solaris.h
 
 EXTRA_DIST =        mysql.h.pp mysql/plugin.h.pp probes_mysql.d.base 
 
diff --git a/include/atomic/gcc_builtins.h b/include/atomic/gcc_builtins.h
index 01ebc38707e..100ff80cacd 100644
--- a/include/atomic/gcc_builtins.h
+++ b/include/atomic/gcc_builtins.h
@@ -18,7 +18,7 @@
 
 #define make_atomic_add_body(S)                     \
   v= __sync_fetch_and_add(a, v);
-#define make_atomic_swap_body(S)                    \
+#define make_atomic_fas_body(S)                     \
   v= __sync_lock_test_and_set(a, v);
 #define make_atomic_cas_body(S)                     \
   int ## S sav;                                     \
@@ -28,7 +28,10 @@
 #ifdef MY_ATOMIC_MODE_DUMMY
 #define make_atomic_load_body(S)   ret= *a
 #define make_atomic_store_body(S)  *a= v
+#define MY_ATOMIC_MODE "gcc-builtins-up"
+
 #else
+#define MY_ATOMIC_MODE "gcc-builtins-smp"
 #define make_atomic_load_body(S)                    \
   ret= __sync_fetch_and_or(a, 0);
 #define make_atomic_store_body(S)                   \
diff --git a/include/atomic/generic-msvc.h b/include/atomic/generic-msvc.h
new file mode 100644
index 00000000000..f1e1b0e88c9
--- /dev/null
+++ b/include/atomic/generic-msvc.h
@@ -0,0 +1,116 @@
+/* Copyright (C) 2006-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#ifndef _atomic_h_cleanup_
+#define _atomic_h_cleanup_ "atomic/generic-msvc.h"
+
+/*
+  We don't implement anything specific for MY_ATOMIC_MODE_DUMMY, always use
+  intrinsics.
+  8 and 16-bit atomics are not implemented, but it can be done if necessary.
+*/
+#undef MY_ATOMIC_HAS_8_16
+
+/*
+  x86 compilers (both VS2003 or VS2005) never use intrinsics, but generate
+  function calls to kernel32 instead, even in the optimized build. 
+  We force intrinsics as described in MSDN documentation for 
+  _InterlockedCompareExchange.
+*/
+#ifdef _M_IX86
+
+#if (_MSC_VER >= 1500)
+#include <intrin.h>
+#else
+C_MODE_START
+/*Visual Studio 2003 and earlier do not have prototypes for atomic intrinsics*/
+LONG _InterlockedExchange (LONG volatile *Target,LONG Value);
+LONG _InterlockedCompareExchange (LONG volatile *Target, LONG Value, LONG Comp);
+LONG _InterlockedExchangeAdd (LONG volatile *Addend, LONG Value);
+C_MODE_END
+
+#pragma intrinsic(_InterlockedExchangeAdd)
+#pragma intrinsic(_InterlockedCompareExchange)
+#pragma intrinsic(_InterlockedExchange)
+#endif
+
+#define InterlockedExchange        _InterlockedExchange
+#define InterlockedExchangeAdd     _InterlockedExchangeAdd
+#define InterlockedCompareExchange _InterlockedCompareExchange
+/*
+ No need to do something special for InterlockedCompareExchangePointer
+ as it is a #define to InterlockedCompareExchange. The same applies to
+ InterlockedExchangePointer. 
+*/
+#endif /*_M_IX86*/
+
+#define MY_ATOMIC_MODE "msvc-intrinsics"
+#define IL_EXCHG_ADD32(X,Y)     InterlockedExchangeAdd((volatile LONG *)(X),(Y))
+#define IL_COMP_EXCHG32(X,Y,Z)  InterlockedCompareExchange((volatile LONG *)(X),(Y),(Z))
+#define IL_COMP_EXCHGptr        InterlockedCompareExchangePointer
+#define IL_EXCHG32(X,Y)         InterlockedExchange((volatile LONG *)(X),(Y))
+#define IL_EXCHGptr             InterlockedExchangePointer
+#define make_atomic_add_body(S) \
+  v= IL_EXCHG_ADD ## S (a, v)
+#define make_atomic_cas_body(S)                                 \
+  int ## S initial_cmp= *cmp;                                   \
+  int ## S initial_a= IL_COMP_EXCHG ## S (a, set, initial_cmp); \
+  if (!(ret= (initial_a == initial_cmp))) *cmp= initial_a;
+#define make_atomic_fas_body(S) /* fetch-and-store; name matches the other atomic/ headers */ \
+  v= IL_EXCHG ## S (a, v)
+#define make_atomic_load_body(S)       \
+  ret= 0; /* avoid compiler warning */ \
+  ret= IL_COMP_EXCHG ## S (a, ret, ret);
+
+/*
+  my_yield_processor (equivalent of x86 PAUSE instruction) should be used
+  to improve performance on hyperthreaded CPUs. Intel recommends to use it in
+  spin loops also on non-HT machines to reduce power consumption (see e.g 
+  http://softwarecommunity.intel.com/articles/eng/2004.htm)
+
+  Running benchmarks for spinlocks implemented with InterlockedCompareExchange
+  and YieldProcessor shows that much better performance is achieved by calling
+  YieldProcessor in a loop - that is, yielding longer. On Intel boxes setting
+  loop count in the range 200-300 brought best results.
+ */
+#ifndef YIELD_LOOPS
+#define YIELD_LOOPS 200
+#endif
+
+static __inline int my_yield_processor()
+{
+  int i;
+  for(i=0; i<YIELD_LOOPS; i++)
+  {
+#if (_MSC_VER <= 1310)
+    /* On older compilers YieldProcessor is not available, use inline assembly*/
+    __asm { rep nop }
+#else
+    YieldProcessor();
+#endif
+  }
+  return 1;
+}
+
+#define LF_BACKOFF my_yield_processor()
+#else /* cleanup */
+
+#undef IL_EXCHG_ADD32
+#undef IL_COMP_EXCHG32
+#undef IL_COMP_EXCHGptr
+#undef IL_EXCHG32
+#undef IL_EXCHGptr
+
+#endif
diff --git a/include/atomic/nolock.h b/include/atomic/nolock.h
index d7d87167ade..e4cd9ab9896 100644
--- a/include/atomic/nolock.h
+++ b/include/atomic/nolock.h
@@ -16,43 +16,36 @@
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
 
-#if defined(__i386__) || defined(_M_IX86) || defined(HAVE_GCC_ATOMIC_BUILTINS)
-
-#ifdef MY_ATOMIC_MODE_DUMMY
-#  define LOCK ""
-#else
-#  define LOCK "lock"
-#endif
-
-#ifdef HAVE_GCC_ATOMIC_BUILTINS
-#include "gcc_builtins.h"
-#elif __GNUC__
-#include "x86-gcc.h"
-#elif defined(_MSC_VER)
-#include "x86-msvc.h"
-#endif
-
+#if defined(__i386__) || defined(_MSC_VER) || defined(__x86_64__)   \
+    || defined(HAVE_GCC_ATOMIC_BUILTINS)
+
+#  ifdef MY_ATOMIC_MODE_DUMMY
+#    define LOCK_prefix ""
+#  else
+#    define LOCK_prefix "lock"
+#  endif
+
+#  ifdef HAVE_GCC_ATOMIC_BUILTINS
+#    include "gcc_builtins.h"
+#  elif __GNUC__
+#    include "x86-gcc.h"
+#  elif defined(_MSC_VER)
+#    include "generic-msvc.h"
+#  endif
 #elif defined(HAVE_SOLARIS_ATOMIC)
-
 #include "solaris.h"
-
-#endif /* __i386__ || _M_IX86 || HAVE_GCC_ATOMIC_BUILTINS */
+#endif
 
 #if defined(make_atomic_cas_body) || defined(MY_ATOMICS_MADE)
 /*
  * We have atomics that require no locking
  */
 #define	MY_ATOMIC_NOLOCK
-
-#ifdef __SUNPRO_C
 /*
- * Sun Studio 12 (and likely earlier) does not accept a typedef struct {}
- */
-typedef char my_atomic_rwlock_t;
-#else
-typedef struct { } my_atomic_rwlock_t;
-#endif
-
+  Type not used so minimal size (empty struct has different size between C
+  and C++, zero-length array is gcc-specific).
+*/
+typedef char my_atomic_rwlock_t __attribute__ ((unused));
 #define my_atomic_rwlock_destroy(name)
 #define my_atomic_rwlock_init(name)
 #define my_atomic_rwlock_rdlock(name)
diff --git a/include/atomic/rwlock.h b/include/atomic/rwlock.h
index 0ff4d16c545..a31f8ed6ca1 100644
--- a/include/atomic/rwlock.h
+++ b/include/atomic/rwlock.h
@@ -1,7 +1,7 @@
 #ifndef ATOMIC_RWLOCK_INCLUDED
 #define ATOMIC_RWLOCK_INCLUDED
 
-/* Copyright (C) 2006 MySQL AB
+/* Copyright (C) 2006 MySQL AB, 2009 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -16,7 +16,7 @@
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
 
-typedef struct {pthread_rwlock_t rw;} my_atomic_rwlock_t;
+#define MY_ATOMIC_MODE_RWLOCKS 1
 
 #ifdef MY_ATOMIC_MODE_DUMMY
 /*
@@ -26,6 +26,9 @@ typedef struct {pthread_rwlock_t rw;} my_atomic_rwlock_t;
   implementations (another way is to run a UP build on an SMP box).
 */
 #warning MY_ATOMIC_MODE_DUMMY and MY_ATOMIC_MODE_RWLOCKS are incompatible
+
+typedef char my_atomic_rwlock_t;
+
 #define my_atomic_rwlock_destroy(name)
 #define my_atomic_rwlock_init(name)
 #define my_atomic_rwlock_rdlock(name)
@@ -33,18 +36,63 @@ typedef struct {pthread_rwlock_t rw;} my_atomic_rwlock_t;
 #define my_atomic_rwlock_rdunlock(name)
 #define my_atomic_rwlock_wrunlock(name)
 #define MY_ATOMIC_MODE "dummy (non-atomic)"
-#else
-#define my_atomic_rwlock_destroy(name)     pthread_rwlock_destroy(& (name)->rw)
-#define my_atomic_rwlock_init(name)        pthread_rwlock_init(& (name)->rw, 0)
-#define my_atomic_rwlock_rdlock(name)      pthread_rwlock_rdlock(& (name)->rw)
-#define my_atomic_rwlock_wrlock(name)      pthread_rwlock_wrlock(& (name)->rw)
-#define my_atomic_rwlock_rdunlock(name)    pthread_rwlock_unlock(& (name)->rw)
-#define my_atomic_rwlock_wrunlock(name)    pthread_rwlock_unlock(& (name)->rw)
-#define MY_ATOMIC_MODE "rwlocks"
+#else /* not MY_ATOMIC_MODE_DUMMY */
+
+typedef struct {pthread_mutex_t rw;} my_atomic_rwlock_t;
+
+#ifndef SAFE_MUTEX
+
+/*
+  we're using read-write lock macros but map them to mutex locks, and they're
+  faster. Still, having semantically rich API we can change the
+  underlying implementation, if necessary.
+*/
+#define my_atomic_rwlock_destroy(name)     pthread_mutex_destroy(& (name)->rw)
+#define my_atomic_rwlock_init(name)        pthread_mutex_init(& (name)->rw, 0)
+#define my_atomic_rwlock_rdlock(name)      pthread_mutex_lock(& (name)->rw)
+#define my_atomic_rwlock_wrlock(name)      pthread_mutex_lock(& (name)->rw)
+#define my_atomic_rwlock_rdunlock(name)    pthread_mutex_unlock(& (name)->rw)
+#define my_atomic_rwlock_wrunlock(name)    pthread_mutex_unlock(& (name)->rw)
+
+#else /* SAFE_MUTEX */
+
+/*
+  SAFE_MUTEX pollutes the compiling name space with macros
+  that alter pthread_mutex_t, pthread_mutex_init, etc.
+  Atomic operations should never use the safe mutex wrappers.
+  Unfortunately, there is no way to have both:
+  - safe mutex macros expanding pthread_mutex_lock to safe_mutex_lock
+  - my_atomic macros expanding to unmodified pthread_mutex_lock
+  inlined in the same compilation unit.
+  So, in case of SAFE_MUTEX, a function call is required.
+  Given that SAFE_MUTEX is a debugging facility,
+  this extra function call is not a performance concern for
+  production builds.
+*/
+C_MODE_START
+extern void plain_pthread_mutex_init(safe_mutex_t *);
+extern void plain_pthread_mutex_destroy(safe_mutex_t *);
+extern void plain_pthread_mutex_lock(safe_mutex_t *);
+extern void plain_pthread_mutex_unlock(safe_mutex_t *);
+C_MODE_END
+
+#define my_atomic_rwlock_destroy(name)     plain_pthread_mutex_destroy(&(name)->rw)
+#define my_atomic_rwlock_init(name)        plain_pthread_mutex_init(&(name)->rw)
+#define my_atomic_rwlock_rdlock(name)      plain_pthread_mutex_lock(&(name)->rw)
+#define my_atomic_rwlock_wrlock(name)      plain_pthread_mutex_lock(&(name)->rw)
+#define my_atomic_rwlock_rdunlock(name)    plain_pthread_mutex_unlock(&(name)->rw)
+#define my_atomic_rwlock_wrunlock(name)    plain_pthread_mutex_unlock(&(name)->rw)
+
+#endif /* SAFE_MUTEX */
+
+#define MY_ATOMIC_MODE "mutex"
+#ifndef MY_ATOMIC_MODE_RWLOCKS
+#define MY_ATOMIC_MODE_RWLOCKS 1
+#endif
 #endif
 
 #define make_atomic_add_body(S)     int ## S sav; sav= *a; *a+= v; v=sav;
-#define make_atomic_swap_body(S)    int ## S sav; sav= *a; *a= v; v=sav;
+#define make_atomic_fas_body(S)     int ## S sav; sav= *a; *a= v; v=sav;
 #define make_atomic_cas_body(S)     if ((ret= (*a == *cmp))) *a= set; else *cmp=*a;
 #define make_atomic_load_body(S)    ret= *a;
 #define make_atomic_store_body(S)   *a= v;
diff --git a/include/atomic/solaris.h b/include/atomic/solaris.h
index 4c51253d2d5..45efd9faaba 100644
--- a/include/atomic/solaris.h
+++ b/include/atomic/solaris.h
@@ -186,25 +186,25 @@ my_atomic_storeptr(void * volatile *a, void *v)
 /* ------------------------------------------------------------------------ */
 
 STATIC_INLINE int8
-my_atomic_swap8(int8 volatile *a, int8 v)
+my_atomic_fas8(int8 volatile *a, int8 v)
 {
 	return ((int8) atomic_swap_8((volatile uint8_t *)a, (uint8_t)v));
 }
 
 STATIC_INLINE int16
-my_atomic_swap16(int16 volatile *a, int16 v)
+my_atomic_fas16(int16 volatile *a, int16 v)
 {
 	return ((int16) atomic_swap_16((volatile uint16_t *)a, (uint16_t)v));
 }
 
 STATIC_INLINE int32
-my_atomic_swap32(int32 volatile *a, int32 v)
+my_atomic_fas32(int32 volatile *a, int32 v)
 {
 	return ((int32) atomic_swap_32((volatile uint32_t *)a, (uint32_t)v));
 }
 
 STATIC_INLINE void *
-my_atomic_swapptr(void * volatile *a, void *v)
+my_atomic_fasptr(void * volatile *a, void *v)
 {
 	return (atomic_swap_ptr(a, v));
 }
diff --git a/include/atomic/x86-gcc.h b/include/atomic/x86-gcc.h
index c3029f9c1b4..59090bc26b7 100644
--- a/include/atomic/x86-gcc.h
+++ b/include/atomic/x86-gcc.h
@@ -22,10 +22,18 @@
   architectures support double-word (128-bit) cas.
 */
 
-#ifdef MY_ATOMIC_NO_XADD
-#define MY_ATOMIC_MODE "gcc-x86" LOCK "-no-xadd"
+#ifdef __x86_64__
+#  ifdef MY_ATOMIC_NO_XADD
+#    define MY_ATOMIC_MODE "gcc-amd64" LOCK_prefix "-no-xadd"
+#  else
+#    define MY_ATOMIC_MODE "gcc-amd64" LOCK_prefix
+#  endif
 #else
-#define MY_ATOMIC_MODE "gcc-x86" LOCK
+#  ifdef MY_ATOMIC_NO_XADD
+#    define MY_ATOMIC_MODE "gcc-x86" LOCK_prefix "-no-xadd"
+#  else
+#    define MY_ATOMIC_MODE "gcc-x86" LOCK_prefix
+#  endif
 #endif
 
 /* fix -ansi errors while maintaining readability */
@@ -35,12 +43,12 @@
 
 #ifndef MY_ATOMIC_NO_XADD
 #define make_atomic_add_body(S)					\
-  asm volatile (LOCK "; xadd %0, %1;" : "+r" (v) , "+m" (*a))
+  asm volatile (LOCK_prefix "; xadd %0, %1;" : "+r" (v) , "+m" (*a))
 #endif
-#define make_atomic_swap_body(S)				\
-  asm volatile ("; xchg %0, %1;" : "+q" (v) , "+m" (*a))
+#define make_atomic_fas_body(S)				\
+  asm volatile ("xchg %0, %1;" : "+q" (v) , "+m" (*a))
 #define make_atomic_cas_body(S)					\
-  asm volatile (LOCK "; cmpxchg %3, %0; setz %2;"		\
+  asm volatile (LOCK_prefix "; cmpxchg %3, %0; setz %2;"	\
                : "+m" (*a), "+a" (*cmp), "=q" (ret): "r" (set))
 
 #ifdef MY_ATOMIC_MODE_DUMMY
@@ -49,14 +57,14 @@
 #else
 /*
   Actually 32-bit reads/writes are always atomic on x86
-  But we add LOCK here anyway to force memory barriers
+  But we add LOCK_prefix here anyway to force memory barriers
 */
 #define make_atomic_load_body(S)				\
   ret=0;							\
-  asm volatile (LOCK "; cmpxchg %2, %0"				\
+  asm volatile (LOCK_prefix "; cmpxchg %2, %0"			\
                : "+m" (*a), "+a" (ret): "r" (ret))
 #define make_atomic_store_body(S)				\
-  asm volatile ("; xchg %0, %1;" : "+m" (*a) : "r" (v))
+  asm volatile ("; xchg %0, %1;" : "+m" (*a), "+r" (v))
 #endif
 
 #endif /* ATOMIC_X86_GCC_INCLUDED */
diff --git a/include/atomic/x86-msvc.h b/include/atomic/x86-msvc.h
deleted file mode 100644
index c4885bb8451..00000000000
--- a/include/atomic/x86-msvc.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/* Copyright (C) 2006 MySQL AB
-
-   This program is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published by
-   the Free Software Foundation; version 2 of the License.
-
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
-
-/*
-  XXX 64-bit atomic operations can be implemented using
-  cmpxchg8b, if necessary
-*/
-
-// Would it be better to use intrinsics ?
-// (InterlockedCompareExchange, InterlockedCompareExchange16
-// InterlockedExchangeAdd, InterlockedExchange)
-
-#ifndef _atomic_h_cleanup_
-#define _atomic_h_cleanup_ "atomic/x86-msvc.h"
-
-#define MY_ATOMIC_MODE "msvc-x86" LOCK
-
-#define make_atomic_add_body(S)				\
-  _asm {						\
-    _asm mov   reg_ ## S, v				\
-    _asm LOCK  xadd *a, reg_ ## S			\
-    _asm movzx v, reg_ ## S				\
-  }
-#define make_atomic_cas_body(S)				\
-  _asm {						\
-    _asm mov    areg_ ## S, *cmp			\
-    _asm mov    reg2_ ## S, set				\
-    _asm LOCK cmpxchg *a, reg2_ ## S			\
-    _asm mov    *cmp, areg_ ## S			\
-    _asm setz   al					\
-    _asm movzx  ret, al					\
-  }
-#define make_atomic_swap_body(S)			\
-  _asm {						\
-    _asm mov    reg_ ## S, v				\
-    _asm xchg   *a, reg_ ## S				\
-    _asm mov    v, reg_ ## S				\
-  }
-
-#ifdef MY_ATOMIC_MODE_DUMMY
-#define make_atomic_load_body(S)        ret=*a
-#define make_atomic_store_body(S)       *a=v
-#else
-/*
-  Actually 32-bit reads/writes are always atomic on x86
-  But we add LOCK here anyway to force memory barriers
-*/
-#define make_atomic_load_body(S)			\
-  _asm {						\
-    _asm mov    areg_ ## S, 0				\
-    _asm mov    reg2_ ## S, areg_ ## S			\
-    _asm LOCK cmpxchg *a, reg2_ ## S			\
-    _asm mov    ret, areg_ ## S				\
-  }
-#define make_atomic_store_body(S)			\
-  _asm {						\
-    _asm mov    reg_ ## S, v				\
-    _asm xchg   *a, reg_ ## S				\
-  }
-#endif
-
-#define reg_8           al
-#define reg_16          ax
-#define reg_32          eax
-#define areg_8          al
-#define areg_16         ax
-#define areg_32         eax
-#define reg2_8          bl
-#define reg2_16         bx
-#define reg2_32         ebx
-
-#else /* cleanup */
-
-#undef reg_8
-#undef reg_16
-#undef reg_32
-#undef areg_8
-#undef areg_16
-#undef areg_32
-#undef reg2_8
-#undef reg2_16
-#undef reg2_32
-#endif
-
diff --git a/include/lf.h b/include/lf.h
new file mode 100644
index 00000000000..7e8f05f4ada
--- /dev/null
+++ b/include/lf.h
@@ -0,0 +1,268 @@
+/* Copyright (C) 2007-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#ifndef _lf_h
+#define _lf_h
+
+#include <my_atomic.h>
+
+C_MODE_START
+
+/*
+  Helpers to define both func() and _func(), where
+  func() is a _func() protected by my_atomic_rwlock_wrlock()
+*/
+
+#define lock_wrap(f, t, proto_args, args, lock) \
+t _ ## f proto_args;                            \
+static inline t f  proto_args                   \
+{                                               \
+  t ret;                                        \
+  my_atomic_rwlock_wrlock(lock);                \
+  ret= _ ## f args;                             \
+  my_atomic_rwlock_wrunlock(lock);              \
+  return ret;                                   \
+}
+
+#define lock_wrap_void(f, proto_args, args, lock) \
+void _ ## f proto_args;                         \
+static inline void f proto_args                 \
+{                                               \
+  my_atomic_rwlock_wrlock(lock);                \
+  _ ## f args;                                  \
+  my_atomic_rwlock_wrunlock(lock);              \
+}
+
+#define nolock_wrap(f, t, proto_args, args)     \
+t _ ## f proto_args;                            \
+static inline t f  proto_args                   \
+{                                               \
+  return _ ## f args;                           \
+}
+
+#define nolock_wrap_void(f, proto_args, args)   \
+void _ ## f proto_args;                         \
+static inline void f proto_args                 \
+{                                               \
+  _ ## f args;                                  \
+}
+
+/*
+  wait-free dynamic array, see lf_dynarray.c
+
+  4 levels of 256 elements each mean 4311810304 elements in an array - it
+  should be enough for a while
+*/
+#define LF_DYNARRAY_LEVEL_LENGTH 256
+#define LF_DYNARRAY_LEVELS       4
+
+typedef struct {
+  void * volatile level[LF_DYNARRAY_LEVELS];
+  uint size_of_element;
+  my_atomic_rwlock_t lock;
+} LF_DYNARRAY;
+
+typedef int (*lf_dynarray_func)(void *, void *);
+
+void lf_dynarray_init(LF_DYNARRAY *array, uint element_size);
+void lf_dynarray_destroy(LF_DYNARRAY *array);
+
+nolock_wrap(lf_dynarray_value, void *,
+            (LF_DYNARRAY *array, uint idx),
+            (array, idx))
+lock_wrap(lf_dynarray_lvalue, void *,
+          (LF_DYNARRAY *array, uint idx),
+          (array, idx),
+          &array->lock)
+nolock_wrap(lf_dynarray_iterate, int,
+            (LF_DYNARRAY *array, lf_dynarray_func func, void *arg),
+            (array, func, arg))
+
+/*
+  pin manager for memory allocator, lf_alloc-pin.c
+*/
+
+#define LF_PINBOX_PINS 4
+#define LF_PURGATORY_SIZE 10
+
+typedef void lf_pinbox_free_func(void *, void *, void*);
+
+typedef struct {
+  LF_DYNARRAY pinarray;
+  lf_pinbox_free_func *free_func;
+  void *free_func_arg;
+  uint free_ptr_offset;
+  uint32 volatile pinstack_top_ver;         /* this is a versioned pointer */
+  uint32 volatile pins_in_array;            /* number of elements in array */
+} LF_PINBOX;
+
+typedef struct {
+  void * volatile pin[LF_PINBOX_PINS];
+  LF_PINBOX *pinbox;
+  void  **stack_ends_here;
+  void  *purgatory;
+  uint32 purgatory_count;
+  uint32 volatile link;
+/* we want sizeof(LF_PINS) to be 64 to avoid false sharing */
+#if SIZEOF_INT*2+SIZEOF_CHARP*(LF_PINBOX_PINS+3) != 64
+  char pad[64-sizeof(uint32)*2-sizeof(void*)*(LF_PINBOX_PINS+3)];
+#endif
+} LF_PINS;
+
+/*
+  shortcut macros to do an atomic_wrlock on a structure that uses pins
+  (e.g. lf_hash).
+*/
+#define lf_rwlock_by_pins(PINS)   \
+  my_atomic_rwlock_wrlock(&(PINS)->pinbox->pinarray.lock)
+#define lf_rwunlock_by_pins(PINS) \
+  my_atomic_rwlock_wrunlock(&(PINS)->pinbox->pinarray.lock)
+
+/*
+  compile-time assert, to require "no less than N" pins
+  it's enough if it'll fail on at least one compiler, so
+  we'll enable it on GCC only, which supports zero-length arrays.
+*/
+#if defined(__GNUC__) && defined(MY_LF_EXTRA_DEBUG)
+#define LF_REQUIRE_PINS(N)                                      \
+  static const char require_pins[LF_PINBOX_PINS-N]              \
+                             __attribute__ ((unused));          \
+  static const int LF_NUM_PINS_IN_THIS_FILE= N;
+#define _lf_pin(PINS, PIN, ADDR)                                \
+  (                                                             \
+    assert(PIN < LF_NUM_PINS_IN_THIS_FILE),                     \
+    my_atomic_storeptr(&(PINS)->pin[PIN], (ADDR))               \
+  )
+#else
+#define LF_REQUIRE_PINS(N)
+#define _lf_pin(PINS, PIN, ADDR)  my_atomic_storeptr(&(PINS)->pin[PIN], (ADDR))
+#endif
+
+#define _lf_unpin(PINS, PIN)      _lf_pin(PINS, PIN, NULL)
+#define lf_pin(PINS, PIN, ADDR)   \
+  do {                            \
+    lf_rwlock_by_pins(PINS);      \
+    _lf_pin(PINS, PIN, ADDR);     \
+    lf_rwunlock_by_pins(PINS);    \
+  } while (0)
+#define lf_unpin(PINS, PIN)  lf_pin(PINS, PIN, NULL)
+#define _lf_assert_pin(PINS, PIN) assert((PINS)->pin[PIN] != 0)
+#define _lf_assert_unpin(PINS, PIN) assert((PINS)->pin[PIN] == 0)
+
+void lf_pinbox_init(LF_PINBOX *pinbox, uint free_ptr_offset,
+                    lf_pinbox_free_func *free_func, void * free_func_arg);
+void lf_pinbox_destroy(LF_PINBOX *pinbox);
+
+lock_wrap(lf_pinbox_get_pins, LF_PINS *,
+          (LF_PINBOX *pinbox),
+          (pinbox),
+          &pinbox->pinarray.lock)
+lock_wrap_void(lf_pinbox_put_pins,
+               (LF_PINS *pins),
+               (pins),
+               &pins->pinbox->pinarray.lock)
+lock_wrap_void(lf_pinbox_free,
+               (LF_PINS *pins, void *addr),
+               (pins, addr),
+               &pins->pinbox->pinarray.lock)
+
+/*
+  memory allocator, lf_alloc-pin.c
+*/
+
+typedef struct st_lf_allocator {
+  LF_PINBOX pinbox;
+  uchar * volatile top;
+  uint element_size;
+  uint32 volatile mallocs;
+  void (*constructor)(uchar *); /* called, when an object is malloc()'ed */
+  void (*destructor)(uchar *);  /* called, when an object is free()'d    */
+} LF_ALLOCATOR;
+
+void lf_alloc_init(LF_ALLOCATOR *allocator, uint size, uint free_ptr_offset);
+void lf_alloc_destroy(LF_ALLOCATOR *allocator);
+uint lf_alloc_pool_count(LF_ALLOCATOR *allocator);
+/*
+  shortcut macros to access underlying pinbox functions from an LF_ALLOCATOR
+  see _lf_pinbox_get_pins() and _lf_pinbox_put_pins()
+*/
+#define _lf_alloc_free(PINS, PTR)     _lf_pinbox_free((PINS), (PTR))
+#define lf_alloc_free(PINS, PTR)       lf_pinbox_free((PINS), (PTR))
+#define _lf_alloc_get_pins(A)         _lf_pinbox_get_pins(&(A)->pinbox)
+#define lf_alloc_get_pins(A)           lf_pinbox_get_pins(&(A)->pinbox)
+#define _lf_alloc_put_pins(PINS)      _lf_pinbox_put_pins(PINS)
+#define lf_alloc_put_pins(PINS)        lf_pinbox_put_pins(PINS)
+#define lf_alloc_direct_free(ALLOC, ADDR) my_free((uchar*)(ADDR), MYF(0))
+
+lock_wrap(lf_alloc_new, void *,
+          (LF_PINS *pins),
+          (pins),
+          &pins->pinbox->pinarray.lock)
+
+C_MODE_END
+
+/*
+  extendible hash, lf_hash.c
+*/
+#include <hash.h>
+
+C_MODE_START
+
+#define LF_HASH_UNIQUE 1
+
+/* lf_hash overhead per element (that is, sizeof(LF_SLIST)) */
+extern const int LF_HASH_OVERHEAD;
+
+typedef struct {
+  LF_DYNARRAY array;                    /* hash itself */
+  LF_ALLOCATOR alloc;                   /* allocator for elements */
+  my_hash_get_key get_key;              /* see HASH */
+  CHARSET_INFO *charset;                /* see HASH */
+  uint key_offset, key_length;          /* see HASH */
+  uint element_size;                    /* size of memcpy'ed area on insert */
+  uint flags;                           /* LF_HASH_UNIQUE, etc */
+  int32 volatile size;                  /* size of array */
+  int32 volatile count;                 /* number of elements in the hash */
+} LF_HASH;
+
+void lf_hash_init(LF_HASH *hash, uint element_size, uint flags,
+                  uint key_offset, uint key_length, my_hash_get_key get_key,
+                  CHARSET_INFO *charset);
+void lf_hash_destroy(LF_HASH *hash);
+int lf_hash_insert(LF_HASH *hash, LF_PINS *pins, const void *data);
+void *lf_hash_search(LF_HASH *hash, LF_PINS *pins, const void *key, uint keylen);
+int lf_hash_delete(LF_HASH *hash, LF_PINS *pins, const void *key, uint keylen);
+/*
+  shortcut macros to access underlying pinbox functions from an LF_HASH
+  see _lf_pinbox_get_pins() and _lf_pinbox_put_pins()
+*/
+#define _lf_hash_get_pins(HASH)     _lf_alloc_get_pins(&(HASH)->alloc)
+#define lf_hash_get_pins(HASH)       lf_alloc_get_pins(&(HASH)->alloc)
+#define _lf_hash_put_pins(PINS)     _lf_pinbox_put_pins(PINS)
+#define lf_hash_put_pins(PINS)       lf_pinbox_put_pins(PINS)
+#define lf_hash_search_unpin(PINS)   lf_unpin((PINS), 2)
+/*
+  cleanup
+*/
+
+#undef lock_wrap_void
+#undef lock_wrap
+#undef nolock_wrap_void
+#undef nolock_wrap
+
+C_MODE_END
+
+#endif
+
diff --git a/include/my_atomic.h b/include/my_atomic.h
index f5da6e6a0d9..85cf87165fb 100644
--- a/include/my_atomic.h
+++ b/include/my_atomic.h
@@ -16,9 +16,51 @@
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
 
+/*
+  This header defines five atomic operations:
+
+  my_atomic_add#(&var, what)
+    add 'what' to *var, and return the old value of *var
+
+  my_atomic_fas#(&var, what)
+    'Fetch And Store'
+    store 'what' in *var, and return the old value of *var
+
+  my_atomic_cas#(&var, &old, new)
+    'Compare And Swap'
+    if *var is equal to *old, then store 'new' in *var, and return TRUE
+    otherwise store *var in *old, and return FALSE
+
+  my_atomic_load#(&var)
+    return *var
+
+  my_atomic_store#(&var, what)
+    store 'what' in *var
+
+  '#' is substituted by a size suffix - 8, 16, 32, or ptr
+  (e.g. my_atomic_add8, my_atomic_fas32, my_atomic_casptr).
+
+  NOTE These operations are not always atomic, so they always must be
+  enclosed in my_atomic_rwlock_rdlock(lock)/my_atomic_rwlock_rdunlock(lock)
+  or my_atomic_rwlock_wrlock(lock)/my_atomic_rwlock_wrunlock(lock).
+  Hint: if a code block makes intensive use of atomic ops, it makes sense
+  to take/release rwlock once for the whole block, not for every statement.
+
+  On architectures where these operations are really atomic, rwlocks will
+  be optimized away.
+  8- and 16-bit atomics aren't implemented for windows (see generic-msvc.h),
+  but can be added, if necessary.
+*/
+
 #ifndef my_atomic_rwlock_init
 
 #define intptr         void *
+/**
+  On most platforms we implement 8-bit, 16-bit, 32-bit and "pointer"
+  operations. Thus the symbol below is defined by default; platforms
+  where we leave out 8-bit or 16-bit operations should undefine it.
+*/
+#define MY_ATOMIC_HAS_8_16 1
 
 #ifndef MY_ATOMIC_MODE_RWLOCKS
 /*
@@ -27,124 +69,223 @@
 #include "atomic/nolock.h"
 #endif
 
-#ifndef MY_ATOMIC_NOLOCK
-/*
- * Have to use rw-locks for atomic ops
- */
+#ifndef make_atomic_cas_body
+/* nolock.h was not able to generate even a CAS function, fall back */
 #include "atomic/rwlock.h"
-#endif
-
-#ifndef MY_ATOMICS_MADE
-
+#else
+/* define missing functions by using the already generated ones */
 #ifndef make_atomic_add_body
-#define make_atomic_add_body(S)					\
+#define make_atomic_add_body(S)                                 \
   int ## S tmp=*a;                                              \
   while (!my_atomic_cas ## S(a, &tmp, tmp+v));                  \
   v=tmp;
 #endif
+#ifndef make_atomic_fas_body
+#define make_atomic_fas_body(S)                                 \
+  int ## S tmp=*a;                                              \
+  while (!my_atomic_cas ## S(a, &tmp, v));                      \
+  v=tmp;
+#endif
+#ifndef make_atomic_load_body
+#define make_atomic_load_body(S)                                \
+  ret= 0; /* avoid compiler warning */                          \
+  (void)(my_atomic_cas ## S(a, &ret, ret));
+#endif
+#ifndef make_atomic_store_body
+#define make_atomic_store_body(S)                               \
+  (void)(my_atomic_fas ## S (a, v));
+#endif
+#endif
+
+/*
+  transparent_union doesn't work in g++
+  Bug ?
+
+  Darwin's gcc doesn't want to put pointers in a transparent_union
+  when built with -arch ppc64. Complains:
+  warning: 'transparent_union' attribute ignored
+*/
+#if defined(__GNUC__) && !defined(__cplusplus) && \
+      ! (defined(__APPLE__) && defined(_ARCH_PPC64))
+/*
+  we want to be able to use my_atomic_xxx functions with
+  both signed and unsigned integers. But gcc will issue a warning
+  "passing arg N of `my_atomic_XXX' as [un]signed due to prototype"
+  if the signedness of the argument doesn't match the prototype, or
+  "pointer targets in passing argument N of my_atomic_XXX differ in signedness"
+  if int* is used where uint* is expected (or vice versa).
+  Let's shut these warnings up
+*/
+#define make_transparent_unions(S)                              \
+        typedef union {                                         \
+          int  ## S  i;                                         \
+          uint ## S  u;                                         \
+        } U_ ## S   __attribute__ ((transparent_union));        \
+        typedef union {                                         \
+          int  ## S volatile *i;                                \
+          uint ## S volatile *u;                                \
+        } Uv_ ## S   __attribute__ ((transparent_union));
+#define uintptr intptr
+make_transparent_unions(8)
+make_transparent_unions(16)
+make_transparent_unions(32)
+make_transparent_unions(ptr)
+#undef uintptr
+#undef make_transparent_unions
+#define a       U_a.i
+#define cmp     U_cmp.i
+#define v       U_v.i
+#define set     U_set.i
+#else
+#define U_8    int8
+#define U_16   int16
+#define U_32   int32
+#define U_ptr  intptr
+#define Uv_8   int8
+#define Uv_16  int16
+#define Uv_32  int32
+#define Uv_ptr intptr
+#define U_a    volatile *a
+#define U_cmp  *cmp
+#define U_v    v
+#define U_set  set
+#endif /* __GCC__ transparent_union magic */
 
 #ifdef HAVE_INLINE
 
-#define make_atomic_add(S)					\
-STATIC_INLINE int ## S my_atomic_add ## S(			\
-                        int ## S volatile *a, int ## S v)	\
-{								\
-  make_atomic_add_body(S);					\
-  return v;							\
+#define make_atomic_cas(S)                                      \
+STATIC_INLINE int my_atomic_cas ## S(Uv_ ## S U_a,              \
+                            Uv_ ## S U_cmp, U_ ## S U_set)      \
+{                                                               \
+  int8 ret;                                                     \
+  make_atomic_cas_body(S);                                      \
+  return ret;                                                   \
 }
 
-#define make_atomic_swap(S)					\
-STATIC_INLINE int ## S my_atomic_swap ## S(			\
-                         int ## S volatile *a, int ## S v)	\
-{								\
-  make_atomic_swap_body(S);					\
-  return v;							\
+#define make_atomic_add(S)                                      \
+STATIC_INLINE int ## S my_atomic_add ## S(                      \
+                        Uv_ ## S U_a, U_ ## S U_v)              \
+{                                                               \
+  make_atomic_add_body(S);                                      \
+  return v;                                                     \
 }
 
-#define make_atomic_cas(S)					\
-STATIC_INLINE int my_atomic_cas ## S(int ## S volatile *a,	\
-                            int ## S *cmp, int ## S set)	\
-{								\
-  int8 ret;							\
-  make_atomic_cas_body(S);					\
-  return ret;							\
+#define make_atomic_fas(S)                                      \
+STATIC_INLINE int ## S my_atomic_fas ## S(                      \
+                         Uv_ ## S U_a, U_ ## S U_v)             \
+{                                                               \
+  make_atomic_fas_body(S);                                      \
+  return v;                                                     \
 }
 
-#define make_atomic_load(S)					\
-STATIC_INLINE int ## S my_atomic_load ## S(int ## S volatile *a) \
-{								\
-  int ## S ret;						\
-  make_atomic_load_body(S);					\
-  return ret;							\
+#define make_atomic_load(S)                                     \
+STATIC_INLINE int ## S my_atomic_load ## S(Uv_ ## S U_a)        \
+{                                                               \
+  int ## S ret;                                                 \
+  make_atomic_load_body(S);                                     \
+  return ret;                                                   \
 }
 
-#define make_atomic_store(S)					\
-STATIC_INLINE void my_atomic_store ## S(			\
-                     int ## S volatile *a, int ## S v)	\
-{								\
-  make_atomic_store_body(S);					\
+#define make_atomic_store(S)                                    \
+STATIC_INLINE void my_atomic_store ## S(                        \
+                     Uv_ ## S U_a, U_ ## S U_v)                 \
+{                                                               \
+  make_atomic_store_body(S);                                    \
 }
 
 #else /* no inline functions */
 
-#define make_atomic_add(S)					\
-extern int ## S my_atomic_add ## S(int ## S volatile *a, int ## S v);
+#define make_atomic_add(S)                                      \
+extern int ## S my_atomic_add ## S(Uv_ ## S U_a, U_ ## S U_v);
 
-#define make_atomic_swap(S)					\
-extern int ## S my_atomic_swap ## S(int ## S volatile *a, int ## S v);
+#define make_atomic_fas(S)                                      \
+extern int ## S my_atomic_fas ## S(Uv_ ## S U_a, U_ ## S U_v);
 
-#define make_atomic_cas(S)					\
-extern int my_atomic_cas ## S(int ## S volatile *a, int ## S *cmp, int ## S set);
+#define make_atomic_cas(S)                                      \
+extern int my_atomic_cas ## S(Uv_ ## S U_a, Uv_ ## S U_cmp, U_ ## S U_set);
 
-#define make_atomic_load(S)					\
-extern int ## S my_atomic_load ## S(int ## S volatile *a);
+#define make_atomic_load(S)                                     \
+extern int ## S my_atomic_load ## S(Uv_ ## S U_a);
 
-#define make_atomic_store(S)					\
-extern void my_atomic_store ## S(int ## S volatile *a, int ## S v);
+#define make_atomic_store(S)                                    \
+extern void my_atomic_store ## S(Uv_ ## S U_a, U_ ## S U_v);
 
 #endif /* HAVE_INLINE */
 
-make_atomic_cas( 8)
+#ifdef MY_ATOMIC_HAS_8_16
+make_atomic_cas(8)
 make_atomic_cas(16)
+#endif
 make_atomic_cas(32)
 make_atomic_cas(ptr)
 
-make_atomic_add( 8)
+#ifdef MY_ATOMIC_HAS_8_16
+make_atomic_add(8)
 make_atomic_add(16)
+#endif
 make_atomic_add(32)
 
-make_atomic_load( 8)
+#ifdef MY_ATOMIC_HAS_8_16
+make_atomic_load(8)
 make_atomic_load(16)
+#endif
 make_atomic_load(32)
 make_atomic_load(ptr)
 
-make_atomic_store( 8)
+#ifdef MY_ATOMIC_HAS_8_16
+make_atomic_fas(8)
+make_atomic_fas(16)
+#endif
+make_atomic_fas(32)
+make_atomic_fas(ptr)
+
+#ifdef MY_ATOMIC_HAS_8_16
+make_atomic_store(8)
 make_atomic_store(16)
+#endif
 make_atomic_store(32)
 make_atomic_store(ptr)
 
-make_atomic_swap( 8)
-make_atomic_swap(16)
-make_atomic_swap(32)
-make_atomic_swap(ptr)
+#ifdef _atomic_h_cleanup_
+#include _atomic_h_cleanup_
+#undef _atomic_h_cleanup_
+#endif
 
+#undef U_8
+#undef U_16
+#undef U_32
+#undef U_ptr
+#undef Uv_8
+#undef Uv_16
+#undef Uv_32
+#undef Uv_ptr
+#undef a
+#undef cmp
+#undef v
+#undef set
+#undef U_a
+#undef U_cmp
+#undef U_v
+#undef U_set
 #undef make_atomic_add
 #undef make_atomic_cas
 #undef make_atomic_load
 #undef make_atomic_store
-#undef make_atomic_swap
+#undef make_atomic_fas
 #undef make_atomic_add_body
 #undef make_atomic_cas_body
 #undef make_atomic_load_body
 #undef make_atomic_store_body
-#undef make_atomic_swap_body
+#undef make_atomic_fas_body
 #undef intptr
 
-#endif /* MY_ATOMICS_MADE */
-
-#ifdef _atomic_h_cleanup_
-#include _atomic_h_cleanup_
-#undef _atomic_h_cleanup_
+/*
+  the macro below defines (as an expression) the code that
+  will be run in spin-loops. Intel manuals recommend having PAUSE there.
+  It is expected to be defined in include/atomic/ *.h files
+*/
+#ifndef LF_BACKOFF
+#define LF_BACKOFF (1)
 #endif
 
 #define MY_ATOMIC_OK       0
diff --git a/include/my_pthread.h b/include/my_pthread.h
index eff6d654f2e..3b95e2adb73 100644
--- a/include/my_pthread.h
+++ b/include/my_pthread.h
@@ -148,6 +148,7 @@ int pthread_join(pthread_t thread, void **value_ptr);
 #define pthread_detach_this_thread()
 #define pthread_condattr_init(A)
 #define pthread_condattr_destroy(A)
+#define pthread_yield() SwitchToThread()
 
 /* per the platform's documentation */
 #define pthread_yield() Sleep(0)
@@ -354,6 +355,17 @@ void my_pthread_attr_getstacksize(pthread_attr_t *attrib, size_t *size);
 int my_pthread_mutex_trylock(pthread_mutex_t *mutex);
 #endif
 
+#if !defined(HAVE_PTHREAD_YIELD_ONE_ARG) && !defined(HAVE_PTHREAD_YIELD_ZERO_ARG)
+/* no pthread_yield() available */
+#ifdef HAVE_SCHED_YIELD
+#define pthread_yield() sched_yield()
+#elif defined(HAVE_PTHREAD_YIELD_NP) /* can be Mac OS X */
+#define pthread_yield() pthread_yield_np()
+#elif defined(HAVE_THR_YIELD)
+#define pthread_yield() thr_yield()
+#endif
+#endif
+
 /*
   The defines set_timespec and set_timespec_nsec should be used
   for calculating an absolute time at which
@@ -632,6 +644,7 @@ struct st_my_thread_var
   my_bool init;
   struct st_my_thread_var *next,**prev;
   void *opt_info;
+  void  *stack_ends_here;
 #ifndef DBUG_OFF
   void *dbug;
   char name[THREAD_NAME_SIZE+1];
diff --git a/mysys/CMakeLists.txt b/mysys/CMakeLists.txt
index 21b04e7c968..757e37aa0a9 100755
--- a/mysys/CMakeLists.txt
+++ b/mysys/CMakeLists.txt
@@ -39,7 +39,9 @@ SET(MYSYS_SOURCES  array.c charset-def.c charset.c checksum.c default.c default_
 				my_static.c my_symlink.c my_symlink2.c my_sync.c my_thr_init.c my_wincond.c
 				my_winerr.c my_winfile.c my_windac.c my_winthread.c my_write.c ptr_cmp.c queues.c stacktrace.c
 				rijndael.c safemalloc.c sha1.c string.c thr_alarm.c thr_lock.c thr_mutex.c
-				thr_rwlock.c tree.c typelib.c my_vle.c base64.c my_memmem.c my_getpagesize.c)
+				thr_rwlock.c tree.c typelib.c my_vle.c base64.c my_memmem.c my_getpagesize.c
+				lf_alloc-pin.c lf_dynarray.c lf_hash.c
+				my_atomic.c my_getncpus.c)
 
 IF(NOT SOURCE_SUBLIBS)
   ADD_LIBRARY(mysys ${MYSYS_SOURCES})
diff --git a/mysys/Makefile.am b/mysys/Makefile.am
index 68fde34ee07..3b53fc1dd59 100644
--- a/mysys/Makefile.am
+++ b/mysys/Makefile.am
@@ -30,7 +30,8 @@ libmysys_a_SOURCES =    my_init.c my_getwd.c mf_getdate.c my_mmap.c \
 			mf_tempdir.c my_lock.c mf_brkhant.c my_alarm.c \
 			my_malloc.c my_realloc.c my_once.c mulalloc.c \
 			my_alloc.c safemalloc.c my_new.cc \
-			my_vle.c my_atomic.c \
+			my_vle.c my_atomic.c lf_hash.c \
+			lf_dynarray.c lf_alloc-pin.c \
 			my_fopen.c my_fstream.c my_getsystime.c \
 			my_error.c errors.c my_div.c my_messnc.c \
 			mf_format.c mf_same.c mf_dirname.c mf_fn_ext.c \
diff --git a/mysys/lf_alloc-pin.c b/mysys/lf_alloc-pin.c
new file mode 100644
index 00000000000..fda9b97791d
--- /dev/null
+++ b/mysys/lf_alloc-pin.c
@@ -0,0 +1,534 @@
+/* QQ: TODO multi-pinbox */
+/* Copyright (C) 2006-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/*
+  wait-free concurrent allocator based on pinning addresses
+
+  It works as follows: every thread (strictly speaking - every CPU, but
+  it's too difficult to do) has a small array of pointers. They're called
+  "pins".  Before using an object its address must be stored in this array
+  (pinned).  When an object is no longer necessary its address must be
+  removed from this array (unpinned). When a thread wants to free() an
+  object it scans all pins of all threads to see if somebody has this
+  object pinned.  If yes - the object is not freed (but stored in a
+  "purgatory").  To reduce the cost of a single free() pins are not scanned
+  on every free() but only added to (thread-local) purgatory. On every
+  LF_PURGATORY_SIZE free() purgatory is scanned and all unpinned objects
+  are freed.
+
+  Pins are used to solve ABA problem. To use pins one must obey
+  a pinning protocol:
+
+   1. Let's assume that PTR is a shared pointer to an object. Shared means
+      that any thread may modify it anytime to point to a different object
+      and free the old object. Later the freed object may be potentially
+      allocated by another thread. If we're unlucky that other thread may
+      set PTR to point to this object again. This is ABA problem.
+   2. Create a local pointer LOCAL_PTR.
+   3. Pin the PTR in a loop:
+      do
+      {
+        LOCAL_PTR= PTR;
+        pin(PTR, PIN_NUMBER);
+      } while (LOCAL_PTR != PTR)
+   4. It is guaranteed that after the loop has ended, LOCAL_PTR
+      points to an object (or NULL, if PTR may be NULL), that
+      will never be freed. It is not guaranteed though
+      that LOCAL_PTR == PTR (as PTR can change any time)
+   5. When done working with the object, remove the pin:
+      unpin(PIN_NUMBER)
+   6. When copying pins (as in the list traversing loop:
+        pin(CUR, 1);
+        while ()
+        {
+          do                            // standard
+          {                             //  pinning
+            NEXT=CUR->next;             //   loop
+            pin(NEXT, 0);               //    see #3
+          } while (NEXT != CUR->next);  //     above
+          ...
+          ...
+          CUR=NEXT;
+          pin(CUR, 1);                  // copy pin[0] to pin[1]
+        }
+      which keeps CUR address constantly pinned), note that pins may be
+      copied only upwards (!!!), that is pin[N] to pin[M], M > N.
+   7. Don't keep the object pinned longer than necessary - the number of
+      pins you have is limited (and small), keeping an object pinned
+      prevents its reuse and causes unnecessary mallocs.
+
+  Explanations:
+
+   3. The loop is important. The following can occur:
+        thread1> LOCAL_PTR= PTR
+        thread2> free(PTR); PTR=0;
+        thread1> pin(PTR, PIN_NUMBER);
+      now thread1 cannot access LOCAL_PTR, even if it's pinned,
+      because it points to a freed memory. That is, it *must*
+      verify that it has indeed pinned PTR, the shared pointer.
+
+   6. When a thread wants to free some LOCAL_PTR, and it scans
+      all lists of pins to see whether it's pinned, it does it
+      upwards, from low pin numbers to high. Thus another thread
+      must copy an address from one pin to another in the same
+      direction - upwards, otherwise the scanning thread may
+      miss it.
+
+  Implementation details:
+
+  Pins are given away from a "pinbox". Pinbox is a stack-based allocator.
+  It uses a dynarray for storing pins: new elements are allocated by the
+  dynarray as necessary, old ones are pushed on the stack for reuse. ABA is
+  solved by versioning a pointer - because we use an array, a pointer to pins
+  is 16 bit, upper 16 bits are used for a version.
+
+  It is assumed that pins belong to a THD and are not transferable
+  between THD's (LF_PINS::stack_ends_here being a primary reason
+  for this limitation).
+*/
+#include <my_global.h>
+#include <my_sys.h>
+#include <lf.h>
+
+#define LF_PINBOX_MAX_PINS 65536
+
+static void _lf_pinbox_real_free(LF_PINS *pins);
+
+/*
+  Set up an empty pinbox: no pins handed out yet, empty stack of free pins.
+  Normally called from lf_alloc_init(); see the latter for details.
+*/
+void lf_pinbox_init(LF_PINBOX *pinbox, uint free_ptr_offset,
+                    lf_pinbox_free_func *free_func, void *free_func_arg)
+{
+  compile_time_assert(sizeof(LF_PINS) == 64);
+  DBUG_ASSERT(free_ptr_offset % sizeof(void *) == 0);
+  pinbox->free_func= free_func;
+  pinbox->free_func_arg= free_func_arg;
+  pinbox->free_ptr_offset= free_ptr_offset;
+  pinbox->pins_in_array= 0;
+  pinbox->pinstack_top_ver= 0;
+  lf_dynarray_init(&pinbox->pinarray, sizeof(LF_PINS));
+}
+
+void lf_pinbox_destroy(LF_PINBOX *pinbox)
+{
+  lf_dynarray_destroy(&pinbox->pinarray); /* pairs with lf_pinbox_init() */
+}
+
+/*
+  Get pins from a pinbox. Usually called via lf_alloc_get_pins() or
+  lf_hash_get_pins().
+
+  SYNOPSYS
+    pinbox      the pinbox to take an LF_PINS structure from
+
+  DESCRIPTION
+    get a new LF_PINS structure from a stack of unused pins, or allocate
+    a new one out of dynarray. Returns 0 if no new pins can be given out.
+
+  NOTE
+    It is assumed that pins belong to a thread and are not transferable
+    between threads.
+*/
+LF_PINS *_lf_pinbox_get_pins(LF_PINBOX *pinbox)
+{
+  uint32 pins, next, top_ver;
+  LF_PINS *el;
+  /*
+    We have an array of max. 64k elements.
+    The highest index currently allocated is pinbox->pins_in_array.
+    Freed elements are in a lifo stack, pinstack_top_ver.
+    pinstack_top_ver is 32 bits; 16 low bits are the index in the
+    array, to the first element of the list. 16 high bits are a version
+    (every time the 16 low bits are updated, the 16 high bits are
+    incremented). Versioning prevents the ABA problem.
+  */
+  top_ver= pinbox->pinstack_top_ver;
+  do
+  {
+    if (!(pins= top_ver % LF_PINBOX_MAX_PINS))
+    {
+      /* the stack of free elements is empty */
+      pins= my_atomic_add32((int32 volatile*) &pinbox->pins_in_array, 1)+1;
+      if (unlikely(pins >= LF_PINBOX_MAX_PINS))
+        return 0;                       /* all indexes are used up */
+      /*
+        note that the first allocated element has index 1 (pins==1).
+        index 0 is reserved to mean "NULL pointer"
+      */
+      el= (LF_PINS *)_lf_dynarray_lvalue(&pinbox->pinarray, pins);
+      if (unlikely(!el))
+        return 0;                       /* no element from the dynarray */
+      break;
+    }
+    el= (LF_PINS *)_lf_dynarray_value(&pinbox->pinarray, pins);
+    next= el->link;                     /* free element below el */
+  } while (!my_atomic_cas32((int32 volatile*) &pinbox->pinstack_top_ver,
+                            (int32*) &top_ver,  /* new top= next, version+1 */
+                            top_ver-pins+next+LF_PINBOX_MAX_PINS));
+  /*
+    set el->link to the index of el in the dynarray. el->link has two usages:
+    - if element is allocated, it's its own index
+    - if element is free, it's the index of the next element in the free stack
+  */
+  el->link= pins;
+  el->purgatory_count= 0;
+  el->pinbox= pinbox;
+  el->stack_ends_here= & my_thread_var->stack_ends_here;
+  return el;
+}
+
+/*
+  Put pins back to a pinbox. Usually called via lf_alloc_put_pins() or
+  lf_hash_put_pins().
+
+  DESCRIPTION
+    empty the purgatory (XXX deadlock warning below!),
+    push LF_PINS structure to a stack
+*/
+void _lf_pinbox_put_pins(LF_PINS *pins)
+{
+  LF_PINBOX *pinbox= pins->pinbox;
+  uint32 top_ver, nr;
+  nr= pins->link;                       /* own index while allocated */
+#ifdef MY_LF_EXTRA_DEBUG
+  {
+    int i;
+    for (i= 0; i < LF_PINBOX_PINS; i++)
+      DBUG_ASSERT(pins->pin[i] == 0);   /* nothing may be pinned any more */
+  }
+#endif
+  /*
+    XXX this will deadlock if other threads will wait for
+    the caller to do something after _lf_pinbox_put_pins(),
+    and they would have pinned addresses that the caller wants to free.
+    Thus: only free pins when all work is done and nobody can wait for you!!!
+  */
+  while (pins->purgatory_count)
+  {
+    _lf_pinbox_real_free(pins);
+    if (pins->purgatory_count)          /* some objects are still pinned */
+    {
+      my_atomic_rwlock_wrunlock(&pins->pinbox->pinarray.lock);
+      pthread_yield();  /* NOTE(review): implies caller holds the wrlock */
+      my_atomic_rwlock_wrlock(&pins->pinbox->pinarray.lock);
+    }
+  }
+  top_ver= pinbox->pinstack_top_ver;
+  do
+  {
+    pins->link= top_ver % LF_PINBOX_MAX_PINS;   /* old top goes below us */
+  } while (!my_atomic_cas32((int32 volatile*) &pinbox->pinstack_top_ver,
+                            (int32*) &top_ver,  /* new top= nr, version+1 */
+                            top_ver-pins->link+nr+LF_PINBOX_MAX_PINS));
+  return;
+}
+
+static int ptr_cmp(void **a, void **b)
+{
+  return (*a > *b) - (*a < *b);         /* -1, 0 or 1 by address order */
+}
+
+#define add_to_purgatory(PINS, ADDR)                                    \
+  do                                                                    \
+  {                                                                     \
+    *(void **)((char *)(ADDR)+(PINS)->pinbox->free_ptr_offset)=         \
+      (PINS)->purgatory;                                                \
+    (PINS)->purgatory= (ADDR);                                          \
+    (PINS)->purgatory_count++;                                          \
+  } while (0)
+
+/*
+  Free an object allocated via pinbox allocator
+
+  DESCRIPTION
+    add an object to purgatory. Only when the purgatory has grown to a
+    multiple of LF_PURGATORY_SIZE, call _lf_pinbox_real_free() to scan it.
+*/
+void _lf_pinbox_free(LF_PINS *pins, void *addr)
+{
+  add_to_purgatory(pins, addr);
+  if (pins->purgatory_count % LF_PURGATORY_SIZE == 0)
+    _lf_pinbox_real_free(pins);
+}
+
+struct st_harvester {
+  void **granary;       /* next free slot in the array of collected pins */
+  int npins;            /* LF_PINS elements left for harvest_pins() to scan */
+};
+
+/*
+  callback for _lf_dynarray_iterate:
+  collect every non-NULL pin of one dynarray page into hv->granary
+*/
+static int harvest_pins(LF_PINS *el, struct st_harvester *hv)
+{
+  int slot, n;
+  int todo= min(hv->npins, LF_DYNARRAY_LEVEL_LENGTH);
+  for (n= 0; n < todo; n++)
+  {
+    for (slot= 0; slot < LF_PINBOX_PINS; slot++)
+    {
+      void *pinned= el[n].pin[slot];
+      if (pinned != NULL)
+        *hv->granary++= pinned;
+    }
+  }
+  /*
+    Account for the page just scanned. hv->npins may drop below
+    zero here, but that only happens on the last dynarray page,
+    after which harvest_pins() is not called again. So nobody
+    bothers to clamp it to 0.
+  */
+  hv->npins-= LF_DYNARRAY_LEVEL_LENGTH;
+  return 0;
+}
+
+/*
+  callback for _lf_dynarray_iterate:
+  tell whether any pin on this dynarray page holds addr (1) or not (0)
+*/
+static int match_pins(LF_PINS *el, void *addr)
+{
+  int n;
+  int slot;
+  for (n= 0; n < LF_DYNARRAY_LEVEL_LENGTH; n++)
+    for (slot= 0; slot < LF_PINBOX_PINS; slot++)
+      if (el[n].pin[slot] == addr)
+        return 1;
+  return 0;
+}
+
+#if STACK_DIRECTION < 0
+#define available_stack_size(CUR,END) (long) ((char*)(CUR) - (char*)(END))
+#else
+#define available_stack_size(CUR,END) (long) ((char*)(END) - (char*)(CUR))
+#endif
+
+#define next_node(P, X) (*((uchar * volatile *)(((uchar *)(X)) + (P)->free_ptr_offset)))
+#define anext_node(X) next_node(&allocator->pinbox, (X))
+
+/*
+  Scan the purgatory: hand unpinned objects to free_func, keep pinned ones
+*/
+static void _lf_pinbox_real_free(LF_PINS *pins)
+{
+  int npins, alloca_size;
+  void *list, **addr;
+  void *first, *last= NULL;
+  LF_PINBOX *pinbox= pins->pinbox;
+
+  LINT_INIT(first);
+  npins= pinbox->pins_in_array+1;       /* indexes 0..pins_in_array */
+
+#ifdef HAVE_ALLOCA
+  alloca_size= sizeof(void *)*LF_PINBOX_PINS*npins; /* room for every pin */
+  /* create a sorted list of pinned addresses, to speed up searches */
+  if (available_stack_size(&pinbox, *pins->stack_ends_here) > alloca_size)
+  {
+    struct st_harvester hv;
+    addr= (void **) alloca(alloca_size);
+    hv.granary= addr;
+    hv.npins= npins;
+    /* scan the dynarray and accumulate all pinned addresses */
+    _lf_dynarray_iterate(&pinbox->pinarray,
+                         (lf_dynarray_func)harvest_pins, &hv);
+
+    npins= hv.granary-addr;             /* now: number of harvested addresses */
+    /* and sort them */
+    if (npins)
+      qsort(addr, npins, sizeof(void *), (qsort_cmp)ptr_cmp);
+  }
+  else
+#endif
+    addr= 0;                            /* no sorted array: use match_pins() */
+
+  list= pins->purgatory;                /* detach the whole purgatory list; */
+  pins->purgatory= 0;                   /* still pinned objects are put     */
+  pins->purgatory_count= 0;             /* back one by one below            */
+  while (list)
+  {
+    void *cur= list;
+    list= *(void **)((char *)cur+pinbox->free_ptr_offset); /* next in list */
+    if (npins)
+    {
+      if (addr) /* use binary search */
+      {
+        void **a, **b, **c;
+        for (a= addr, b= addr+npins-1, c= a+(b-a)/2; (b-a) > 1; c= a+(b-a)/2)
+          if (cur == *c)
+            a= b= c;
+          else if (cur > *c)
+            a= c;
+          else
+            b= c;
+        if (cur == *a || cur == *b)     /* at most two candidates are left */
+          goto found;
+      }
+      else /* no alloca - no cookie. linear search here */
+      {
+        if (_lf_dynarray_iterate(&pinbox->pinarray,
+                                 (lf_dynarray_func)match_pins, cur))
+          goto found;
+      }
+    }
+    /* not pinned - freeing: append cur to the first..last list */
+    if (last)
+      last= next_node(pinbox, last)= (uchar *)cur;
+    else
+      first= last= (uchar *)cur;
+    continue;
+found:
+    /* pinned - keeping */
+    add_to_purgatory(pins, cur);
+  }
+  if (last)                             /* at least one object to free */
+    pinbox->free_func(first, last, pinbox->free_func_arg);
+}
+
+/* lock-free memory allocator for fixed-size objects */
+
+LF_REQUIRE_PINS(1)
+
+/*
+  callback for _lf_pinbox_real_free to free a list of unpinned objects -
+  add it back to the allocator stack
+
+  DESCRIPTION
+    'first' and 'last' are the ends of the linked list of nodes:
+    first->el->el->....->el->last. Use first==last to free only one element.
+*/
+static void alloc_free(uchar *first,            /* head of the list */
+                       uchar volatile *last,    /* tail of the list */
+                       LF_ALLOCATOR *allocator) /* stack to push it on */
+{
+  /*
+    we need a union here to access type-punned pointer reliably.
+    otherwise gcc -fstrict-aliasing will not see 'tmp' changed in the loop
+  */
+  union { uchar * node; void *ptr; } tmp;
+  tmp.node= allocator->top;             /* the top we expect to replace */
+  do    /* a failed CAS stores the current top in tmp: relink and retry */
+  {
+    anext_node(last)= tmp.node;         /* old top goes after our tail */
+  } while (!my_atomic_casptr((void **)(char *)&allocator->top,
+                             (void **)&tmp.ptr, first) && LF_BACKOFF);
+}
+
+/*
+  Prepare a lock-free allocator for use
+
+  SYNOPSYS
+    allocator           the allocator to initialize
+    size                size in bytes of every object it hands out
+    free_ptr_offset     offset within an object of a sizeof(void *) area
+                        that no thread (not just the purgatory owner)
+                        touches once the object is in a purgatory.
+                        The purgatory list of waiting-to-be-freed
+                        objects is linked through this area.
+                        size must be at least free_ptr_offset+sizeof(void *).
+*/
+void lf_alloc_init(LF_ALLOCATOR *allocator, uint size, uint free_ptr_offset)
+{
+  allocator->element_size= size;
+  allocator->constructor= 0;
+  allocator->destructor= 0;
+  allocator->mallocs= 0;
+  allocator->top= 0;
+  lf_pinbox_init(&allocator->pinbox, free_ptr_offset,
+                 (lf_pinbox_free_func *)alloc_free, allocator);
+  DBUG_ASSERT(size >= sizeof(void*) + free_ptr_offset);
+}
+
+/*
+  Tear down the allocator and release every object still in its stack
+
+  NOTE
+    Like all init/destroy functions here and elsewhere, this one
+    is not thread safe: make sure that no thread needs the
+    allocator any longer before destroying it.
+    Any access to an allocator that is being destroyed, or that
+    has already been destroyed, is the caller's own problem.
+    And no, cats still don't belong in microwaves.
+*/
+void lf_alloc_destroy(LF_ALLOCATOR *allocator)
+{
+  uchar *cur;
+  uchar *following;
+  for (cur= allocator->top; cur != NULL; cur= following)
+  {
+    following= anext_node(cur);         /* read the link before freeing */
+    if (allocator->destructor != NULL)
+      allocator->destructor(cur);
+    my_free((void *)cur, MYF(0));
+  }
+  allocator->top= 0;
+  lf_pinbox_destroy(&allocator->pinbox);
+}
+
+/*
+  Allocate and return a new object, or NULL if my_malloc() fails.
+
+  DESCRIPTION
+    Pop an unused object from the stack or malloc it if the stack is empty.
+    pin[0] is used, it's removed on return.
+*/
+void *_lf_alloc_new(LF_PINS *pins)
+{
+  LF_ALLOCATOR *allocator= (LF_ALLOCATOR *)(pins->pinbox->free_func_arg);
+  uchar *node;
+  for (;;)
+  {
+    do                                  /* standard pinning loop */
+    {
+      node= allocator->top;
+      _lf_pin(pins, 0, node);
+    } while (node != allocator->top && LF_BACKOFF);
+    if (!node)                          /* the stack is empty */
+    {
+      node= (void *)my_malloc(allocator->element_size, MYF(MY_WME));
+      if (node && allocator->constructor)       /* never construct NULL */
+        allocator->constructor(node);
+#ifdef MY_LF_EXTRA_DEBUG
+      if (likely(node != 0))
+        my_atomic_add32(&allocator->mallocs, 1);
+#endif
+      break;
+    }
+    if (my_atomic_casptr((void **)(char *)&allocator->top,
+                         (void *)&node, anext_node(node)))
+      break;                            /* popped node off the stack */
+  }
+  _lf_unpin(pins, 0);
+  return node;
+}
+
+/*
+  Walk the allocator stack and return how many objects it holds.
+
+  NOTE
+    This is NOT thread-safe !!!
+*/
+uint lf_alloc_pool_count(LF_ALLOCATOR *allocator)
+{
+  uint total;
+  uchar *cur= allocator->top;
+  for (total= 0; cur != NULL; total++)
+    cur= anext_node(cur);
+  return total;
+}
+
diff --git a/mysys/lf_dynarray.c b/mysys/lf_dynarray.c
new file mode 100644
index 00000000000..b1cdce698a9
--- /dev/null
+++ b/mysys/lf_dynarray.c
@@ -0,0 +1,207 @@
+/* Copyright (C) 2006 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/*
+  Analog of DYNAMIC_ARRAY that never reallocs
+  (so no pointer into the array may ever become invalid).
+
+  Memory is allocated in non-contiguous chunks.
+  This data structure is not space efficient for sparse arrays.
+
+  Every element is aligned to sizeof(element) boundary
+  (to avoid false sharing if element is big enough).
+
+  LF_DYNARRAY is a recursive structure. On the zero level
+  LF_DYNARRAY::level[0] it's an array of LF_DYNARRAY_LEVEL_LENGTH elements,
+  on the first level it's an array of LF_DYNARRAY_LEVEL_LENGTH pointers
+  to arrays of elements, on the second level it's an array of pointers
+  to arrays of pointers to arrays of elements. And so on.
+
+  With four levels the number of elements is limited to 4311810304
+  (but as in all functions index is uint, the real limit is 2^32-1)
+
+  Actually, it's wait-free, not lock-free ;-)
+*/
+
+#include <my_global.h>
+#include <m_string.h>
+#include <my_sys.h>
+#include <lf.h>
+
+void lf_dynarray_init(LF_DYNARRAY *array, uint element_size)
+{
+  bzero(array, sizeof(*array));  /* zeroes every field, incl. level[] roots */
+  array->size_of_element= element_size;
+  my_atomic_rwlock_init(&array->lock);
+}
+
+static void recursive_free(void **alloc, int level)
+{
+  int slot;
+  if (!alloc)
+    return;
+  if (level == 0)
+  {
+    /* element block: the address to free is kept in the slot just before */
+    my_free(alloc[-1], MYF(0));
+    return;
+  }
+  for (slot= 0; slot < LF_DYNARRAY_LEVEL_LENGTH; slot++)
+    recursive_free(alloc[slot], level-1);   /* release the subtrees first */
+  my_free((void *)alloc, MYF(0));
+}
+
+void lf_dynarray_destroy(LF_DYNARRAY *array)
+{
+  int i;
+  for (i= 0; i < LF_DYNARRAY_LEVELS; i++)
+    recursive_free(array->level[i], i); /* level[i] roots a tree of depth i */
+  my_atomic_rwlock_destroy(&array->lock);
+}
+
+static const ulong dynarray_idxes_in_prev_levels[LF_DYNARRAY_LEVELS]=
+{ /* entry [i]: how many indexes are served by the levels below level i */
+  0, /* +1 here to avoid -1's below */
+  LF_DYNARRAY_LEVEL_LENGTH,
+  LF_DYNARRAY_LEVEL_LENGTH * LF_DYNARRAY_LEVEL_LENGTH +
+    LF_DYNARRAY_LEVEL_LENGTH,
+  LF_DYNARRAY_LEVEL_LENGTH * LF_DYNARRAY_LEVEL_LENGTH *
+    LF_DYNARRAY_LEVEL_LENGTH + LF_DYNARRAY_LEVEL_LENGTH *
+    LF_DYNARRAY_LEVEL_LENGTH + LF_DYNARRAY_LEVEL_LENGTH
+};
+
+static const ulong dynarray_idxes_in_prev_level[LF_DYNARRAY_LEVELS]=
+{ /* entry [i]: how many indexes one slot of a level-i pointer array covers */
+  0, /* +1 here to avoid -1's below */
+  LF_DYNARRAY_LEVEL_LENGTH,
+  LF_DYNARRAY_LEVEL_LENGTH * LF_DYNARRAY_LEVEL_LENGTH,
+  LF_DYNARRAY_LEVEL_LENGTH * LF_DYNARRAY_LEVEL_LENGTH *
+    LF_DYNARRAY_LEVEL_LENGTH,
+};
+
+/*
+  Returns a valid lvalue pointer to the element number 'idx'.
+  Allocates memory if necessary.
+*/
+void *_lf_dynarray_lvalue(LF_DYNARRAY *array, uint idx)
+{
+  void * ptr, * volatile * ptr_ptr= 0;
+  int i;
+  /* find the level whose index range contains idx, rebase idx into it */
+  for (i= LF_DYNARRAY_LEVELS-1; idx < dynarray_idxes_in_prev_levels[i]; i--)
+    /* no-op */;
+  ptr_ptr= &array->level[i];
+  idx-= dynarray_idxes_in_prev_levels[i];
+  for (; i > 0; i--)
+  {
+    if (!(ptr= *ptr_ptr))
+    {
+      void *alloc= my_malloc(LF_DYNARRAY_LEVEL_LENGTH * sizeof(void *),
+                             MYF(MY_WME|MY_ZEROFILL));
+      if (unlikely(!alloc))
+        return(NULL);
+      if (my_atomic_casptr(ptr_ptr, &ptr, alloc))
+        ptr= alloc;
+      else
+        my_free(alloc, MYF(0));         /* lost the race, drop our block */
+    }
+    ptr_ptr= ((void **)ptr) + idx / dynarray_idxes_in_prev_level[i];
+    idx%= dynarray_idxes_in_prev_level[i];
+  }
+  if (!(ptr= *ptr_ptr))
+  {
+    uchar *alloc, *data;
+    alloc= my_malloc(LF_DYNARRAY_LEVEL_LENGTH * array->size_of_element +
+                    array->size_of_element + sizeof(void *),
+                    MYF(MY_WME|MY_ZEROFILL));
+    if (unlikely(!alloc))
+      return(NULL);
+    /* reserve the space for free() address (the sizeof(void *) part) */
+    data= alloc + sizeof(void *);
+    { /* alignment: shifts data by < size_of_element, hence the spare one */
+      intptr mod= ((intptr)data) % array->size_of_element;
+      if (mod)
+        data+= array->size_of_element - mod;
+    }
+    ((void **)data)[-1]= alloc; /* free() will need the original pointer */
+    if (my_atomic_casptr(ptr_ptr, &ptr, data))
+      ptr= data;
+    else
+      my_free(alloc, MYF(0));           /* lost the race, drop our block */
+  }
+  return ((uchar*)ptr) + array->size_of_element * idx;
+}
+
+/*
+  Returns a pointer to the element number 'idx'
+  or NULL if an element does not exist
+*/
+void *_lf_dynarray_value(LF_DYNARRAY *array, uint idx)
+{
+  void * ptr, * volatile * ptr_ptr= 0;
+  int i;
+  /* same walk as in _lf_dynarray_lvalue(), but nothing is ever allocated */
+  for (i= LF_DYNARRAY_LEVELS-1; idx < dynarray_idxes_in_prev_levels[i]; i--)
+    /* no-op */;
+  ptr_ptr= &array->level[i];
+  idx-= dynarray_idxes_in_prev_levels[i];
+  for (; i > 0; i--)
+  {
+    if (!(ptr= *ptr_ptr))
+      return(NULL);                     /* this subtree was never created */
+    ptr_ptr= ((void **)ptr) + idx / dynarray_idxes_in_prev_level[i];
+    idx %= dynarray_idxes_in_prev_level[i];
+  }
+  if (!(ptr= *ptr_ptr))
+    return(NULL);                       /* no element block here yet */
+  return ((uchar*)ptr) + array->size_of_element * idx;
+}
+
+static int recursive_iterate(LF_DYNARRAY *array, void *ptr, int level,
+                             lf_dynarray_func func, void *arg)
+{
+  int slot, rc= 0;
+  if (!ptr)
+    return 0;                           /* nothing was allocated here */
+  if (level == 0)
+    return func(ptr, arg);              /* a block of elements: report it */
+  /* a block of pointers: visit the children until one of them says stop */
+  for (slot= 0; rc == 0 && slot < LF_DYNARRAY_LEVEL_LENGTH; slot++)
+    rc= recursive_iterate(array, ((void **)ptr)[slot], level-1, func, arg);
+  return rc;
+}
+
+/*
+  Calls func(array, arg) on every array of LF_DYNARRAY_LEVEL_LENGTH elements
+  in lf_dynarray.
+
+  DESCRIPTION
+    lf_dynarray consists of a set of arrays, LF_DYNARRAY_LEVEL_LENGTH elements
+    each. _lf_dynarray_iterate() calls user-supplied function on every array
+    from the set. It is the fastest way to scan the array, faster than
+      for (i=0; i < N; i++) { func(_lf_dynarray_value(dynarray, i)); }
+
+  NOTE
+    if func() returns non-zero, the scan is aborted
+*/
+int _lf_dynarray_iterate(LF_DYNARRAY *array, lf_dynarray_func func, void *arg)
+{
+  int lvl, rc= 0;
+  /* scan the tree of each level in turn; a non-zero result ends the scan */
+  for (lvl= 0; rc == 0 && lvl < LF_DYNARRAY_LEVELS; lvl++)
+    rc= recursive_iterate(array, array->level[lvl], lvl, func, arg);
+  return rc;
+}
+
diff --git a/mysys/lf_hash.c b/mysys/lf_hash.c
new file mode 100644
index 00000000000..f478196c7c8
--- /dev/null
+++ b/mysys/lf_hash.c
@@ -0,0 +1,505 @@
+/* Copyright (C) 2006-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/*
+  extensible hash
+
+  TODO
+     try to get rid of dummy nodes ?
+     for non-unique hash, count only _distinct_ values
+     (but how to do it in lf_hash_delete ?)
+*/
+#include <my_global.h>
+#include <m_string.h>
+#include <my_sys.h>
+#include <my_bit.h>
+#include <lf.h>
+
+LF_REQUIRE_PINS(3)
+
+/* An element of the list */
+typedef struct {
+  intptr volatile link; /* pointer to the next element in a list, and a flag */
+  uint32 hashnr;        /* reversed hash number, for sorting                 */
+  const uchar *key;     /* key bytes, as returned by hash_key()              */
+  size_t keylen;        /* length of key                                     */
+  /*
+    data is stored here, directly after the keylen.
+    thus the pointer to data is (void*)(slist_element_ptr+1)
+  */
+} LF_SLIST;
+
+const int LF_HASH_OVERHEAD= sizeof(LF_SLIST);
+
+/*
+  a structure to pass the context (pointers to the three successive elements
+  in a list) from lfind to linsert/ldelete
+*/
+typedef struct {
+  intptr volatile *prev;    /* the link (or list head) that curr was read from */
+  LF_SLIST *curr, *next;    /* node under inspection and its successor        */
+} CURSOR;
+
+/*
+  the last bit in LF_SLIST::link is a "deleted" flag.
+  the helper macros below convert it to a pure pointer or a pure flag
+*/
+#define PTR(V)      (LF_SLIST *)((V) & (~(intptr)1))
+#define DELETED(V)  ((V) & 1)
+
+/*
+  DESCRIPTION
+    Search for hashnr/key/keylen in the list starting from 'head' and
+    position the cursor. The list is ORDER BY hashnr, key
+
+  RETURN
+    0 - not found
+    1 - found
+
+  NOTE
+    cursor is positioned in either case
+    pins[0..2] are used, they are NOT removed on return
+*/
+static int lfind(LF_SLIST * volatile *head, CHARSET_INFO *cs, uint32 hashnr,
+                 const uchar *key, uint keylen, CURSOR *cursor, LF_PINS *pins)
+{
+  uint32       cur_hashnr;
+  const uchar  *cur_key;
+  uint         cur_keylen;
+  intptr       link;
+
+retry:
+  cursor->prev= (intptr *)head;         /* (re)start from the list head */
+  do { /* PTR() isn't necessary below, head is a dummy node */
+    cursor->curr= (LF_SLIST *)(*cursor->prev);
+    _lf_pin(pins, 1, cursor->curr);
+  } while (*cursor->prev != (intptr)cursor->curr && LF_BACKOFF);
+  for (;;)
+  {
+    if (unlikely(!cursor->curr))
+      return 0; /* end of the list */
+    do {
+      /* QQ: XXX or goto retry ? */
+      link= cursor->curr->link;
+      cursor->next= PTR(link);
+      _lf_pin(pins, 0, cursor->next);
+    } while (link != cursor->curr->link && LF_BACKOFF);
+    cur_hashnr= cursor->curr->hashnr;
+    cur_key= cursor->curr->key;
+    cur_keylen= cursor->curr->keylen;
+    if (*cursor->prev != (intptr)cursor->curr) /* prev no longer leads here */
+    {
+      (void)LF_BACKOFF;
+      goto retry;
+    }
+    if (!DELETED(link))
+    {
+      if (cur_hashnr >= hashnr)
+      {
+        int r= 1;
+        if (cur_hashnr > hashnr ||
+            (r= my_strnncoll(cs, (uchar*) cur_key, cur_keylen, (uchar*) key,
+                             keylen)) >= 0)
+          return !r;                    /* 1 only when the keys compare equal */
+      }
+      cursor->prev= &(cursor->curr->link);
+      _lf_pin(pins, 2, cursor->curr);   /* pin the node that prev points into */
+    }
+    else
+    {
+      /*
+        we found a deleted node - be nice, help the other thread
+        and remove this deleted node
+      */
+      if (my_atomic_casptr((void **)cursor->prev,
+                           (void **)&cursor->curr, cursor->next))
+        _lf_alloc_free(pins, cursor->curr);
+      else
+      {
+        (void)LF_BACKOFF;
+        goto retry;
+      }
+    }
+    cursor->curr= cursor->next;         /* advance; pin 1 follows curr */
+    _lf_pin(pins, 1, cursor->curr);
+  }
+}
+
+/*
+  DESCRIPTION
+    insert a 'node' in the list that starts from 'head' in the correct
+    position (as found by lfind)
+
+  RETURN
+    0     - inserted
+    not 0 - a pointer to a duplicate (not pinned and thus unusable)
+
+  NOTE
+    it uses pins[0..2], on return all pins are removed.
+    if there're nodes with the same key value, a new node is added before them.
+*/
+static LF_SLIST *linsert(LF_SLIST * volatile *head, CHARSET_INFO *cs,
+                         LF_SLIST *node, LF_PINS *pins, uint flags)
+{
+  CURSOR         cursor;
+  int            res;
+
+  for (;;)                              /* repeated while the CAS below fails */
+  {
+    if (lfind(head, cs, node->hashnr, node->key, node->keylen,
+              &cursor, pins) &&
+        (flags & LF_HASH_UNIQUE))
+    {
+      res= 0; /* duplicate found */
+      break;
+    }
+    else
+    {
+      node->link= (intptr)cursor.curr;  /* the new node goes right before curr */
+      DBUG_ASSERT(node->link != (intptr)node); /* no circular references */
+      DBUG_ASSERT(cursor.prev != &node->link); /* no circular references */
+      if (my_atomic_casptr((void **)cursor.prev, (void **)&cursor.curr, node))
+      {
+        res= 1; /* inserted ok */
+        break;
+      }
+    }
+  }
+  _lf_unpin(pins, 0);
+  _lf_unpin(pins, 1);
+  _lf_unpin(pins, 2);
+  /*
+    Note that cursor.curr is not pinned here and the pointer is unreliable,
+    the object may disappear anytime. But if it points to a dummy node, the
+    pointer is safe, because dummy nodes are never freed - initialize_bucket()
+    uses this fact.
+  */
+  return res ? 0 : cursor.curr;
+}
+
+/*
+  DESCRIPTION
+    deletes a node as identified by hashnr/key/keylen from the list
+    that starts from 'head'
+
+  RETURN
+    0 - ok
+    1 - not found
+
+  NOTE
+    it uses pins[0..2], on return all pins are removed.
+*/
+static int ldelete(LF_SLIST * volatile *head, CHARSET_INFO *cs, uint32 hashnr,
+                   const uchar *key, uint keylen, LF_PINS *pins)
+{
+  CURSOR cursor;
+  int res;
+
+  for (;;)                              /* repeated while the marking CAS fails */
+  {
+    if (!lfind(head, cs, hashnr, key, keylen, &cursor, pins))
+    {
+      res= 1; /* not found */
+      break;
+    }
+    else
+    {
+      /* mark the node deleted: set the lowest bit of its link */
+      if (my_atomic_casptr((void **)&(cursor.curr->link),
+                           (void **)&cursor.next,
+                           (void *)(((intptr)cursor.next) | 1)))
+      {
+        /* and remove it from the list */
+        if (my_atomic_casptr((void **)cursor.prev,
+                             (void **)&cursor.curr, cursor.next))
+          _lf_alloc_free(pins, cursor.curr);
+        else
+        {
+          /*
+            somebody already "helped" us and removed the node ?
+            Let's check if we need to help that someone too!
+            (to ensure the number of "set DELETED flag" actions
+            is equal to the number of "remove from the list" actions)
+          */
+          lfind(head, cs, hashnr, key, keylen, &cursor, pins);
+        }
+        res= 0;                         /* the flag was set by this call */
+        break;
+      }
+    }
+  }
+  _lf_unpin(pins, 0);
+  _lf_unpin(pins, 1);
+  _lf_unpin(pins, 2);
+  return res;
+}
+
+/*
+  DESCRIPTION
+    searches for a node as identified by hashnr/key/keylen in the list
+    that starts from 'head'
+
+  RETURN
+    0 - not found
+    node - found
+
+  NOTE
+    it uses pins[0..2], on return the pin[2] keeps the node found
+    all other pins are removed.
+*/
+static LF_SLIST *lsearch(LF_SLIST * volatile *head, CHARSET_INFO *cs,
+                         uint32 hashnr, const uchar *key, uint keylen,
+                         LF_PINS *pins)
+{
+  CURSOR cursor;
+  int res= lfind(head, cs, hashnr, key, keylen, &cursor, pins);
+  if (res)
+    _lf_pin(pins, 2, cursor.curr);      /* pin 2 now protects the found node */
+  _lf_unpin(pins, 0);
+  _lf_unpin(pins, 1);
+  return res ? cursor.curr : 0;
+}
+
+static inline const uchar* hash_key(const LF_HASH *hash,
+                                    const uchar *record, size_t *length)
+{
+  if (!hash->get_key)
+    *length= hash->key_length;          /* fixed-size key inside the record */
+  return hash->get_key ? (*hash->get_key)(record, length, 0)
+                       : record + hash->key_offset;
+}
+
+/*
+  Compute the hash key value from the raw key.
+
+  @note, that the hash value is limited to 2^31, because we need one
+  bit to distinguish between normal and dummy nodes.
+*/
+static inline uint calc_hash(LF_HASH *hash, const uchar *key, uint keylen)
+{
+  CHARSET_INFO *cs= hash->charset;
+  ulong seed1= 1, seed2= 4;
+  cs->coll->hash_sort(cs, (uchar*) key, keylen, &seed1, &seed2);
+  return seed1 & INT_MAX32;             /* only the low 31 bits are kept */
+}
+
+#define MAX_LOAD 1.0    /* average number of elements in a bucket */
+
+static int initialize_bucket(LF_HASH *, LF_SLIST * volatile*, uint, LF_PINS *);
+
+/*
+  Initializes lf_hash, the arguments are compatible with hash_init
+
+  @note element_size sets both the size of the memory block allocated by
+  lf_alloc and the size of the block memcpy'ed in lf_hash_insert. Typically
+  they are the same, indeed. But LF_HASH::element_size can be decreased
+  after lf_hash_init, and then lf_alloc will allocate a larger block than
+  lf_hash_insert will copy over. It is desirable if part of the element
+  is expensive to initialize - for example if there is a mutex or
+  DYNAMIC_ARRAY. In this case they should be initialized in the
+  LF_ALLOCATOR::constructor, and lf_hash_insert should not overwrite them.
+  See wt_init() for example.
+*/
+void lf_hash_init(LF_HASH *hash, uint element_size, uint flags,
+                  uint key_offset, uint key_length, my_hash_get_key get_key,
+                  CHARSET_INFO *charset)
+{ /* every allocated node is an LF_SLIST header followed by the element */
+  lf_alloc_init(&hash->alloc, sizeof(LF_SLIST)+element_size,
+                offsetof(LF_SLIST, key));
+  lf_dynarray_init(&hash->array, sizeof(LF_SLIST *));
+  hash->size= 1;                        /* bucket= hashnr % size */
+  hash->count= 0;                       /* number of elements in the hash */
+  hash->element_size= element_size;     /* bytes copied by lf_hash_insert() */
+  hash->flags= flags;
+  hash->charset= charset ? charset : &my_charset_bin; /* default: binary */
+  hash->key_offset= key_offset;
+  hash->key_length= key_length;
+  hash->get_key= get_key;
+  DBUG_ASSERT(get_key ? !key_offset && !key_length : key_length);
+}
+
+void lf_hash_destroy(LF_HASH *hash)
+{
+  LF_SLIST *el, **head= (LF_SLIST **)_lf_dynarray_value(&hash->array, 0);
+
+  /*
+    no bucket 0 means no node to free, but still destroy allocator and array
+  */
+  el= head ? *head : 0;
+  while (el)
+  {
+    intptr next= el->link;
+    if (el->hashnr & 1)
+      lf_alloc_direct_free(&hash->alloc, el); /* normal node */
+    else
+      my_free((void *)el, MYF(0)); /* dummy node */
+    el= (LF_SLIST *)next;
+  }
+  lf_alloc_destroy(&hash->alloc);
+  lf_dynarray_destroy(&hash->array);
+}
+
+/*
+  DESCRIPTION
+    inserts a new element to a hash. it will have a _copy_ of
+    data, not a pointer to it.
+  RETURN
+    0 - inserted
+    1 - didn't (unique key conflict)
+   -1 - out of memory
+
+  NOTE
+    see linsert() for pin usage notes
+*/
+int lf_hash_insert(LF_HASH *hash, LF_PINS *pins, const void *data)
+{
+  int csize, bucket, hashnr, res= -1;
+  LF_SLIST *node, * volatile *el;
+
+  lf_rwlock_by_pins(pins);
+  node= (LF_SLIST *)_lf_alloc_new(pins);
+  if (unlikely(!node))
+    goto end;
+  memcpy(node+1, data, hash->element_size);
+  node->key= hash_key(hash, (uchar *)(node+1), &node->keylen);
+  hashnr= calc_hash(hash, node->key, node->keylen);
+  bucket= hashnr % hash->size;
+  el= _lf_dynarray_lvalue(&hash->array, bucket);
+  if (unlikely(!el) ||
+      (*el == NULL && unlikely(initialize_bucket(hash, el, bucket, pins))))
+    goto free_node;                     /* res is still -1: out of memory */
+  node->hashnr= my_reverse_bits(hashnr) | 1; /* normal node */
+  res= 1;                               /* until linsert() says otherwise */
+  if (linsert(el, hash->charset, node, pins, hash->flags))
+    goto free_node;                     /* unique key conflict */
+  res= 0;
+  csize= hash->size;
+  if ((my_atomic_add32(&hash->count, 1)+1.0) / csize > MAX_LOAD)
+    my_atomic_cas32(&hash->size, &csize, csize*2);
+  goto end;
+free_node:                              /* the node never made it to the list */
+  _lf_alloc_free(pins, node);
+end:                                    /* the only exit: always unlocks */
+  lf_rwunlock_by_pins(pins);
+  return res;
+}
+
+/*
+  DESCRIPTION
+    deletes an element with the given key from the hash (if a hash is
+    not unique and there're many elements with this key - the "first"
+    matching element is deleted)
+  RETURN
+    0 - deleted
+    1 - didn't (not found)
+   -1 - out of memory
+  NOTE
+    see ldelete() for pin usage notes
+*/
+int lf_hash_delete(LF_HASH *hash, LF_PINS *pins, const void *key, uint keylen)
+{
+  LF_SLIST * volatile *el;
+  uint bucket, hashnr= calc_hash(hash, (uchar *)key, keylen);
+  /* res stays -1 (out of memory) unless ldelete() gets to run */
+  int res= -1;
+
+  bucket= hashnr % hash->size;
+  lf_rwlock_by_pins(pins);
+  el= _lf_dynarray_lvalue(&hash->array, bucket);
+  /*
+    note that we still need to initialize_bucket here,
+    we cannot return "node not found", because an old bucket of that
+    node may've been split and the node was assigned to a new bucket
+    that was never accessed before and thus is not initialized.
+  */
+  if (unlikely(!el) ||
+      (*el == NULL && unlikely(initialize_bucket(hash, el, bucket, pins))))
+    goto end;
+  res= ldelete(el, hash->charset, my_reverse_bits(hashnr) | 1,
+               (uchar *)key, keylen, pins) ? 1 : 0;
+  if (res == 0)
+    my_atomic_add32(&hash->count, -1);
+end:
+  /* every path leaves through here, so the lock is always released */
+  lf_rwunlock_by_pins(pins);
+  return res;
+}
+
+/*
+  RETURN
+    a pointer to an element with the given key (if a hash is not unique and
+    there're many elements with this key - the "first" matching element)
+    NULL         if nothing is found
+    MY_ERRPTR    if OOM
+  NOTE: see lsearch() for pin usage notes
+*/
+void *lf_hash_search(LF_HASH *hash, LF_PINS *pins, const void *key, uint keylen)
+{
+  LF_SLIST * volatile *el, *found;
+  uint bucket, hashnr= calc_hash(hash, (uchar *)key, keylen);
+  void *res= MY_ERRPTR; /* stays so if the bucket cannot be set up (OOM) */
+
+  bucket= hashnr % hash->size;
+  lf_rwlock_by_pins(pins);
+  el= _lf_dynarray_lvalue(&hash->array, bucket);
+  if (unlikely(!el) ||
+      (*el == NULL && unlikely(initialize_bucket(hash, el, bucket, pins))))
+    goto end;
+  found= lsearch(el, hash->charset, my_reverse_bits(hashnr) | 1,
+                 (uchar *)key, keylen, pins);
+  res= found ? found+1 : 0;             /* the data follows the node header */
+end:                                    /* the only exit: always unlocks */
+  lf_rwunlock_by_pins(pins);
+  return res;
+}
+
+static const uchar *dummy_key= (uchar*)"";
+
+/*
+  RETURN
+    0 - ok
+   -1 - out of memory
+*/
+static int initialize_bucket(LF_HASH *hash, LF_SLIST * volatile *node,
+                              uint bucket, LF_PINS *pins)
+{
+  uint parent= my_clear_highest_bit(bucket);
+  LF_SLIST *dummy= (LF_SLIST *)my_malloc(sizeof(LF_SLIST), MYF(MY_WME));
+  LF_SLIST **tmp= 0, *cur;
+  LF_SLIST * volatile *el= _lf_dynarray_lvalue(&hash->array, parent);
+  if (unlikely(!el || !dummy) ||
+      (*el == NULL && bucket &&
+       unlikely(initialize_bucket(hash, el, parent, pins))))
+  {
+    if (dummy)
+      my_free((void *)dummy, MYF(0));   /* not linked anywhere yet: free it */
+    return -1;
+  }
+  dummy->hashnr= my_reverse_bits(bucket) | 0; /* dummy node */
+  dummy->key= dummy_key;
+  dummy->keylen= 0;
+  if ((cur= linsert(el, hash->charset, dummy, pins, LF_HASH_UNIQUE)))
+  {
+    my_free((void *)dummy, MYF(0));
+    dummy= cur;
+  }
+  my_atomic_casptr((void **)node, (void **)&tmp, dummy);
+  /*
+    If the CAS above failed after our linsert() succeeded, another thread's
+    linsert() of the same dummy failed, it picked up our node ("dummy= cur")
+    and ran this same CAS. So no retry and no free(dummy) - nothing leaks.
+  */
+  return 0;
+}
diff --git a/mysys/my_atomic.c b/mysys/my_atomic.c
index aa04d55f624..6bc76f0de3c 100644
--- a/mysys/my_atomic.c
+++ b/mysys/my_atomic.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006 MySQL AB
+/* Copyright (C) 2006 MySQL AB, 2008-2009 Sun Microsystems, Inc
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -14,7 +14,7 @@
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
 
 #include <my_global.h>
-#include <my_pthread.h>
+#include <my_sys.h>
 
 #ifndef HAVE_INLINE
 /* the following will cause all inline functions to be instantiated */
@@ -43,3 +43,32 @@ int my_atomic_initialize()
 #endif
 }
 
+#ifdef SAFE_MUTEX
+#undef pthread_mutex_init
+#undef pthread_mutex_destroy
+#undef pthread_mutex_lock
+#undef pthread_mutex_unlock
+
+void plain_pthread_mutex_init(safe_mutex_t *m)
+{ /* initializes only the pthread mutex embedded in the safe_mutex_t */
+  pthread_mutex_init(& m->mutex, NULL);
+}
+
+void plain_pthread_mutex_destroy(safe_mutex_t *m)
+{ /* destroys only the embedded pthread mutex */
+  pthread_mutex_destroy(& m->mutex);
+}
+
+void plain_pthread_mutex_lock(safe_mutex_t *m)
+{ /* locks the embedded pthread mutex directly */
+  pthread_mutex_lock(& m->mutex);
+}
+
+void plain_pthread_mutex_unlock(safe_mutex_t *m)
+{ /* unlocks the embedded pthread mutex directly */
+  pthread_mutex_unlock(& m->mutex);
+}
+
+#endif
+
+
diff --git a/mysys/my_getncpus.c b/mysys/my_getncpus.c
index 82e87dee2e4..5be961e3bc9 100644
--- a/mysys/my_getncpus.c
+++ b/mysys/my_getncpus.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006 MySQL AB
+/* Copyright (C) 2006 MySQL AB, 2008-2009 Sun Microsystems, Inc
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -16,24 +16,34 @@
 /* get the number of (online) CPUs */
 
 #include "mysys_priv.h"
+#ifdef HAVE_UNISTD_H
 #include <unistd.h>
+#endif
 
 static int ncpus=0;
 
-#ifdef _SC_NPROCESSORS_ONLN
 int my_getncpus()
 {
   if (!ncpus)
+  {
+#ifdef _SC_NPROCESSORS_ONLN
     ncpus= sysconf(_SC_NPROCESSORS_ONLN);
-  return ncpus;
-}
-
+#elif defined(__WIN__)
+    SYSTEM_INFO sysinfo;
+
+    /*
+    * We are not calling GetNativeSystemInfo here because (1) we
+    * don't believe that they return different values for number
+    * of processors and (2) if WOW64 limits processors for Win32
+    * then we don't want to try to override that.
+    */
+    GetSystemInfo(&sysinfo);
+
+    ncpus= sysinfo.dwNumberOfProcessors;
 #else
-/* unknown */
-int my_getncpus()
-{
-  return 2;
-}
-
+/* unknown so play safe: assume SMP and forbid uniprocessor build */
+    ncpus= 2;
 #endif
-
+  }
+  return ncpus;
+}
diff --git a/mysys/my_thr_init.c b/mysys/my_thr_init.c
index b2972faf0f8..ba59c483012 100644
--- a/mysys/my_thr_init.c
+++ b/mysys/my_thr_init.c
@@ -284,6 +284,9 @@ my_bool my_thread_init(void)
   pthread_cond_init(&tmp->suspend, NULL);
   tmp->init= 1;
 
+  tmp->stack_ends_here= (char*)&tmp +
+                         STACK_DIRECTION * (long)my_thread_stack_size;
+
   pthread_mutex_lock(&THR_LOCK_threads);
   tmp->id= ++thread_id;
   ++THR_thread_count;
diff --git a/unittest/examples/CMakeLists.txt b/unittest/examples/CMakeLists.txt
new file mode 100644
index 00000000000..a5aa5a93985
--- /dev/null
+++ b/unittest/examples/CMakeLists.txt
@@ -0,0 +1,38 @@
+# Copyright (C) 2007 MySQL AB
+# 
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; version 2 of the License.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+
+# Header search paths used when compiling the example programs.
+INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/include ${CMAKE_SOURCE_DIR}/zlib
+                    ${CMAKE_SOURCE_DIR}/sql
+                    ${CMAKE_SOURCE_DIR}/regex
+                    ${CMAKE_SOURCE_DIR}/extra/yassl/include
+                    ${CMAKE_SOURCE_DIR}/unittest/mytap)
+
+# The TAP example programs. Each one is built from the single source file
+# <name>.c and linked against the mytap library, so they are declared in
+# a loop; to add an example, add its name to the list.
+SET(EXAMPLE_PROGRAMS
+    simple-t
+    skip-t
+    todo-t
+    skip_all-t
+    no_plan-t
+    core-t)
+
+FOREACH(EXAMPLE ${EXAMPLE_PROGRAMS})
+  ADD_EXECUTABLE(${EXAMPLE} ${EXAMPLE}.c)
+  TARGET_LINK_LIBRARIES(${EXAMPLE} mytap)
+ENDFOREACH(EXAMPLE)
diff --git a/unittest/examples/core-t.c b/unittest/examples/core-t.c
index cafe2df9954..a9b798d3064 100644
--- a/unittest/examples/core-t.c
+++ b/unittest/examples/core-t.c
@@ -13,7 +13,7 @@
    along with this program; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA */
 
-#include "my_config.h"
+#include <my_global.h>
 
 #include <stdlib.h>
 #include <tap.h>
diff --git a/unittest/examples/no_plan-t.c b/unittest/examples/no_plan-t.c
index 56aabd6d752..06378e81218 100644
--- a/unittest/examples/no_plan-t.c
+++ b/unittest/examples/no_plan-t.c
@@ -13,7 +13,7 @@
    along with this program; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA */
 
-#include "my_config.h"
+#include <my_global.h>
 
 #include <stdlib.h>
 #include <tap.h>
diff --git a/unittest/examples/skip_all-t.c b/unittest/examples/skip_all-t.c
index a4c8648fbe4..11c1ef13276 100644
--- a/unittest/examples/skip_all-t.c
+++ b/unittest/examples/skip_all-t.c
@@ -13,7 +13,7 @@
    along with this program; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA */
 
-#include "my_config.h"
+#include <my_global.h>
 
 #include <stdlib.h>
 #include <tap.h>
diff --git a/unittest/examples/todo-t.c b/unittest/examples/todo-t.c
index 2de409447ba..027d6d6b65e 100644
--- a/unittest/examples/todo-t.c
+++ b/unittest/examples/todo-t.c
@@ -13,7 +13,7 @@
    along with this program; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA */
 
-#include "my_config.h"
+#include <my_global.h>
 
 #include <stdlib.h>
 #include <tap.h>
diff --git a/unittest/mysys/Makefile.am b/unittest/mysys/Makefile.am
index 56c65d71396..d83b2909048 100644
--- a/unittest/mysys/Makefile.am
+++ b/unittest/mysys/Makefile.am
@@ -16,12 +16,14 @@
 AM_CPPFLAGS      = @ZLIB_INCLUDES@ -I$(top_builddir)/include 
 AM_CPPFLAGS     += -I$(top_srcdir)/include -I$(top_srcdir)/unittest/mytap
 
+noinst_HEADERS = thr_template.c
+
 LDADD 		= $(top_builddir)/unittest/mytap/libmytap.a \
 		  $(top_builddir)/mysys/libmysys.a \
 		  $(top_builddir)/dbug/libdbug.a \
 		  $(top_builddir)/strings/libmystrings.a
 
-noinst_PROGRAMS  = bitmap-t base64-t my_vsnprintf-t
+noinst_PROGRAMS  = bitmap-t base64-t lf-t my_vsnprintf-t
 
 if NEED_THREAD
 # my_atomic-t is used to check thread functions, so it is safe to 
diff --git a/unittest/mysys/lf-t.c b/unittest/mysys/lf-t.c
new file mode 100644
index 00000000000..61b7ae08cf5
--- /dev/null
+++ b/unittest/mysys/lf-t.c
@@ -0,0 +1,178 @@
+/* Copyright (C) 2008-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/**
+  @file
+
+  Unit tests for lock-free algorithms of mysys
+*/
+
+#include "thr_template.c"
+
+#include <lf.h>
+
+int32 inserts= 0, N;
+LF_ALLOCATOR lf_allocator;
+LF_HASH lf_hash;
+
+/*
+  pin allocator - alloc and release an element in a loop
+*/
+pthread_handler_t test_lf_pinbox(void *arg)
+{
+  int    m= *(int *)arg;                /* number of loop iterations */
+  int32 x= 0;
+  LF_PINS *pins;
+
+  my_thread_init();
+
+  pins= lf_pinbox_get_pins(&lf_allocator.pinbox);
+
+  for (x= ((int)(intptr)(&m)); m ; m--)
+  {
+    lf_pinbox_put_pins(pins);           /* give the pins back ... */
+    pins= lf_pinbox_get_pins(&lf_allocator.pinbox); /* ... and get new ones */
+  }
+  lf_pinbox_put_pins(pins);
+  pthread_mutex_lock(&mutex);
+  if (!--running_threads) pthread_cond_signal(&cond); /* last thread signals */
+  pthread_mutex_unlock(&mutex);
+  my_thread_end();
+  return 0;
+}
+
+/*
+  thread local data area, allocated using lf_alloc.
+  union is required to enforce the minimum required element size (sizeof(ptr))
+*/
+typedef union {
+  int32 data;           /* the value the test writes and reads back */
+  void *not_used;       /* its offset is what do_tests() gives lf_alloc_init */
+} TLA;
+
+pthread_handler_t test_lf_alloc(void *arg)
+{
+  int    m= (*(int *)arg)/2;            /* two allocations per iteration */
+  int32 x,y= 0;
+  LF_PINS *pins;
+
+  my_thread_init();
+
+  pins= lf_alloc_get_pins(&lf_allocator);
+
+  for (x= ((int)(intptr)(&m)); m ; m--)
+  {
+    TLA *node1, *node2;
+    x= (x*m+0x87654321) & INT_MAX32;    /* next pseudo-random value */
+    node1= (TLA *)lf_alloc_new(pins);
+    node1->data= x;
+    y+= node1->data;                    /* +x if the value reads back intact */
+    node1->data= 0;
+    node2= (TLA *)lf_alloc_new(pins);
+    node2->data= x;
+    y-= node2->data;                    /* -x, so y returns to 0 */
+    node2->data= 0;
+    lf_alloc_free(pins, node1);
+    lf_alloc_free(pins, node2);
+  }
+  lf_alloc_put_pins(pins);
+  pthread_mutex_lock(&mutex);
+  bad+= y;                              /* non-zero y: a read differed */
+
+  if (--N == 0)                         /* the last thread to finish reports */
+  {
+    diag("%d mallocs, %d pins in stack",
+         lf_allocator.mallocs, lf_allocator.pinbox.pins_in_array);
+#ifdef MY_LF_EXTRA_DEBUG
+    bad|= lf_allocator.mallocs - lf_alloc_pool_count(&lf_allocator);
+#endif
+  }
+  if (!--running_threads) pthread_cond_signal(&cond);
+  pthread_mutex_unlock(&mutex);
+  my_thread_end();
+  return 0;
+}
+
+#define N_TLH 1000
+pthread_handler_t test_lf_hash(void *arg)
+{
+  int    m= (*(int *)arg)/(2*N_TLH);    /* N_TLH inserts + N_TLH deletes each */
+  int32 x,y,z,sum= 0, ins= 0;
+  LF_PINS *pins;
+
+  my_thread_init();
+
+  pins= lf_hash_get_pins(&lf_hash);
+
+  for (x= ((int)(intptr)(&m)); m ; m--)
+  {
+    int i;
+    y= x;                               /* replay the same sequence on delete */
+    for (i= 0; i < N_TLH; i++)
+    {
+      x= (x*(m+i)+0x87654321) & INT_MAX32;
+      z= (x<0) ? -x : x;
+      if (lf_hash_insert(&lf_hash, pins, &z)) /* non-zero: nothing inserted */
+      {
+        sum+= z;
+        ins++;
+      }
+    }
+    for (i= 0; i < N_TLH; i++)
+    {
+      y= (y*(m+i)+0x87654321) & INT_MAX32;
+      z= (y<0) ? -y : y;
+      if (lf_hash_delete(&lf_hash, pins, (uchar *)&z, sizeof(z)))
+        sum-= z;                        /* non-zero: nothing was deleted */
+    }
+  }
+  lf_hash_put_pins(pins);
+  pthread_mutex_lock(&mutex);
+  bad+= sum;
+  inserts+= ins;
+
+  if (--N == 0)                         /* the last thread to finish reports */
+  {
+    diag("%d mallocs, %d pins in stack, %d hash size, %d inserts",
+         lf_hash.alloc.mallocs, lf_hash.alloc.pinbox.pins_in_array,
+         lf_hash.size, inserts);
+    bad|= lf_hash.count;                /* the hash must be empty again */
+  }
+  if (!--running_threads) pthread_cond_signal(&cond);
+  pthread_mutex_unlock(&mutex);
+  my_thread_end();
+  return 0;
+}
+
+
+void do_tests()
+{
+  plan(4);      /* my_atomic_initialize() + the three concurrent tests below */
+
+  lf_alloc_init(&lf_allocator, sizeof(TLA), offsetof(TLA, not_used));
+  lf_hash_init(&lf_hash, sizeof(int), LF_HASH_UNIQUE, 0, sizeof(int), 0,
+               &my_charset_bin);
+
+  bad= my_atomic_initialize();
+  ok(!bad, "my_atomic_initialize() returned %d", bad);
+
+  test_concurrently("lf_pinbox", test_lf_pinbox, N= THREADS, CYCLES);
+  test_concurrently("lf_alloc",  test_lf_alloc,  N= THREADS, CYCLES);
+  test_concurrently("lf_hash",   test_lf_hash,   N= THREADS, CYCLES/10);
+
+  lf_hash_destroy(&lf_hash);
+  lf_alloc_destroy(&lf_allocator);
+}
+
diff --git a/unittest/mysys/my_atomic-t.c b/unittest/mysys/my_atomic-t.c
index f2bcd360508..1a558e8cb73 100644
--- a/unittest/mysys/my_atomic-t.c
+++ b/unittest/mysys/my_atomic-t.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006 MySQL AB
+/* Copyright (C) 2006-2008 MySQL AB, 2008 Sun Microsystems, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -13,10 +13,7 @@
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
 
-#include <my_global.h>
-#include <my_sys.h>
-#include <my_atomic.h>
-#include <tap.h>
+#include "thr_template.c"
 
 /* at least gcc 3.4.5 and 3.4.6 (but not 3.2.3) on RHEL */
 #if __GNUC__ == 3 && __GNUC_MINOR__ == 4
@@ -25,181 +22,125 @@
 #define GCC_BUG_WORKAROUND
 #endif
 
-int32 a32,b32,c32;
+volatile uint32 b32;
+volatile int32  c32;
 my_atomic_rwlock_t rwl;
 
-pthread_attr_t thr_attr;
-pthread_mutex_t mutex;
-pthread_cond_t cond;
-int N;
-
 /* add and sub a random number in a loop. Must get 0 at the end */
-pthread_handler_t test_atomic_add_handler(void *arg)
+pthread_handler_t test_atomic_add(void *arg)
 {
-  int    m=*(int *)arg;
+  int    m= (*(int *)arg)/2;
   GCC_BUG_WORKAROUND int32 x;
-  for (x=((int)((long)(&m))); m ; m--)
+  for (x= ((int)(intptr)(&m)); m ; m--)
   {
-    x=x*m+0x87654321;
+    x= (x*m+0x87654321) & INT_MAX32;
     my_atomic_rwlock_wrlock(&rwl);
-    my_atomic_add32(&a32, x);
+    my_atomic_add32(&bad, x);
     my_atomic_rwlock_wrunlock(&rwl);
 
     my_atomic_rwlock_wrlock(&rwl);
-    my_atomic_add32(&a32, -x);
+    my_atomic_add32(&bad, -x);
     my_atomic_rwlock_wrunlock(&rwl);
   }
   pthread_mutex_lock(&mutex);
-  N--;
-  if (!N) pthread_cond_signal(&cond);
+  if (!--running_threads) pthread_cond_signal(&cond);
   pthread_mutex_unlock(&mutex);
   return 0;
 }
 
 /*
   1. generate thread number 0..N-1 from b32
-  2. add it to a32
+  2. add it to bad
   3. swap thread numbers in c32
   4. (optionally) one more swap to avoid 0 as a result
-  5. subtract result from a32
-  must get 0 in a32 at the end
+  5. subtract result from bad
+  must get 0 in bad at the end
 */
-pthread_handler_t test_atomic_swap_handler(void *arg)
+pthread_handler_t test_atomic_fas(void *arg)
 {
-  int    m=*(int *)arg;
-  int32 x;
+  int    m= *(int *)arg;
+  int32  x;
 
   my_atomic_rwlock_wrlock(&rwl);
-  x=my_atomic_add32(&b32, 1);
+  x= my_atomic_add32(&b32, 1);
   my_atomic_rwlock_wrunlock(&rwl);
 
   my_atomic_rwlock_wrlock(&rwl);
-  my_atomic_add32(&a32, x);
+  my_atomic_add32(&bad, x);
   my_atomic_rwlock_wrunlock(&rwl);
 
   for (; m ; m--)
   {
     my_atomic_rwlock_wrlock(&rwl);
-    x=my_atomic_swap32(&c32, x);
+    x= my_atomic_fas32(&c32, x);
     my_atomic_rwlock_wrunlock(&rwl);
   }
 
   if (!x)
   {
     my_atomic_rwlock_wrlock(&rwl);
-    x=my_atomic_swap32(&c32, x);
+    x= my_atomic_fas32(&c32, x);
     my_atomic_rwlock_wrunlock(&rwl);
   }
 
   my_atomic_rwlock_wrlock(&rwl);
-  my_atomic_add32(&a32, -x);
+  my_atomic_add32(&bad, -x);
   my_atomic_rwlock_wrunlock(&rwl);
 
   pthread_mutex_lock(&mutex);
-  N--;
-  if (!N) pthread_cond_signal(&cond);
+  if (!--running_threads) pthread_cond_signal(&cond);
   pthread_mutex_unlock(&mutex);
   return 0;
 }
 
 /*
-  same as test_atomic_add_handler, but my_atomic_add32 is emulated with
-  (slower) my_atomic_cas32
+  same as test_atomic_add, but my_atomic_add32 is emulated with
+  my_atomic_cas32 - notice that the slowdown is proportional to the
+  number of CPUs
 */
-pthread_handler_t test_atomic_cas_handler(void *arg)
+pthread_handler_t test_atomic_cas(void *arg)  /* arg: int*, iteration budget */
 {
-  int    m=*(int *)arg, ok;
-  GCC_BUG_WORKAROUND int32 x,y;
-  for (x=((int)((long)(&m))); m ; m--)
+  int    m= (*(int *)arg)/2, ok= 0;  /* every round performs two updates: +x, then -x */
+  GCC_BUG_WORKAROUND int32 x, y;
+  for (x= ((int)(intptr)(&m)); m ; m--)  /* stack address of m serves as the seed */
   {
     my_atomic_rwlock_wrlock(&rwl);
-    y=my_atomic_load32(&a32);
+    y= my_atomic_load32(&bad);
     my_atomic_rwlock_wrunlock(&rwl);
-
-    x=x*m+0x87654321;
+    x= (x*m+0x87654321) & INT_MAX32;
     do {
       my_atomic_rwlock_wrlock(&rwl);
-      ok=my_atomic_cas32(&a32, &y, y+x);
+      ok= my_atomic_cas32(&bad, &y, (uint32)y+x);  /* unsigned arithmetic: wraps instead of overflowing */
       my_atomic_rwlock_wrunlock(&rwl);
-    } while (!ok);
+    } while (!ok) ;
     do {
       my_atomic_rwlock_wrlock(&rwl);
-      ok=my_atomic_cas32(&a32, &y, y-x);
+      ok= my_atomic_cas32(&bad, &y, (uint32)y-x);  /* same unsigned wrap-around as the add above */
       my_atomic_rwlock_wrunlock(&rwl);
-    } while (!ok);
+    } while (!ok) ;
   }
   pthread_mutex_lock(&mutex);
-  N--;
-  if (!N) pthread_cond_signal(&cond);
+  if (!--running_threads) pthread_cond_signal(&cond);  /* last thread wakes test_concurrently */
   pthread_mutex_unlock(&mutex);
   return 0;
 }
 
-void test_atomic(const char *test, pthread_handler handler, int n, int m)
-{
-  pthread_t t;
-  ulonglong now=my_getsystime();
-
-  a32= 0;
-  b32= 0;
-  c32= 0;
-
-  diag("Testing %s with %d threads, %d iterations... ", test, n, m);
-  for (N=n ; n ; n--)
-  {
-    if (pthread_create(&t, &thr_attr, handler, &m) != 0)
-    {
-      diag("Could not create thread");
-      a32= 1;
-      goto err;
-    }
-  }
-
-  pthread_mutex_lock(&mutex);
-  while (N)
-    pthread_cond_wait(&cond, &mutex);
-  pthread_mutex_unlock(&mutex);
-  now=my_getsystime()-now;
-err:
-  ok(a32 == 0, "tested %s in %g secs", test, ((double)now)/1e7);
-}
 
-int main()
+void do_tests()
 {
-  int err;
-  MY_INIT("my_atomic-t.c");
-
-  diag("N CPUs: %d", my_getncpus());
-  err= my_atomic_initialize();
-
   plan(4);
-  ok(err == 0, "my_atomic_initialize() returned %d", err);
 
-  pthread_attr_init(&thr_attr);
-  pthread_attr_setdetachstate(&thr_attr,PTHREAD_CREATE_DETACHED);
-  pthread_mutex_init(&mutex, 0);
-  pthread_cond_init(&cond, 0);
+  bad= my_atomic_initialize();
+  ok(!bad, "my_atomic_initialize() returned %d", bad);
+
   my_atomic_rwlock_init(&rwl);
 
-#ifdef HPUX11
-#define CYCLES 1000
-#else
-#define CYCLES 10000
-#endif
-#define THREADS 100
-  test_atomic("my_atomic_add32", test_atomic_add_handler, THREADS, CYCLES);
-  test_atomic("my_atomic_swap32", test_atomic_swap_handler, THREADS, CYCLES);
-  test_atomic("my_atomic_cas32", test_atomic_cas_handler, THREADS, CYCLES);
-  /*
-    workaround until we know why it crashes randomly on some machine
-    (BUG#22320).
-  */
-  sleep(2);
-
-  pthread_mutex_destroy(&mutex);
-  pthread_cond_destroy(&cond);
-  pthread_attr_destroy(&thr_attr);
+  b32= c32= 0;
+  test_concurrently("my_atomic_add32", test_atomic_add, THREADS, CYCLES);
+  b32= c32= 0;
+  test_concurrently("my_atomic_fas32", test_atomic_fas, THREADS, CYCLES);
+  b32= c32= 0;
+  test_concurrently("my_atomic_cas32", test_atomic_cas, THREADS, CYCLES);
+
   my_atomic_rwlock_destroy(&rwl);
-  return exit_status();
 }
-
diff --git a/unittest/mysys/thr_template.c b/unittest/mysys/thr_template.c
new file mode 100644
index 00000000000..1ac03e474fd
--- /dev/null
+++ b/unittest/mysys/thr_template.c
@@ -0,0 +1,92 @@
+/* Copyright (C) 2006-2008 MySQL AB, 2008 Sun Microsystems, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#include <my_global.h>
+#include <my_sys.h>
+#include <my_atomic.h>
+#include <tap.h>
+
+volatile uint32 bad;
+pthread_attr_t thr_attr;
+pthread_mutex_t mutex;
+pthread_cond_t cond;
+uint running_threads;
+
+void do_tests();
+
+void test_concurrently(const char *test, pthread_handler handler, int n, int m)
+{
+  /* Start n detached handler threads sharing &m, wait for all, then report. */
+  pthread_t thread;
+  ulonglong started= my_getsystime();
+
+  bad= 0;
+  diag("Testing %s with %d threads, %d iterations... ", test, n, m);
+  running_threads= n;  /* every handler decrements this just before it exits */
+  while (n--)
+  {
+    if (pthread_create(&thread, &thr_attr, handler, &m))
+    {
+      diag("Could not create thread");
+      abort();
+    }
+  }
+  pthread_mutex_lock(&mutex);
+  while (running_threads != 0)
+    pthread_cond_wait(&cond, &mutex);
+  pthread_mutex_unlock(&mutex);
+  ok(!bad, "tested %s in %g secs (%d)", test,
+     ((double)(my_getsystime() - started))/1e7, bad);
+}
+
+int main(int argc __attribute__((unused)), char **argv)
+{  /* shared driver: set up, run do_tests(), tear down, return the TAP exit status */
+  MY_INIT(argv[0]);  /* use the real test binary name, not one fixed label */
+
+  if (argv[1] && *argv[1])
+    DBUG_SET_INITIAL(argv[1]);  /* optional first argument is handed to DBUG */
+
+  pthread_mutex_init(&mutex, 0);
+  pthread_cond_init(&cond, 0);
+  pthread_attr_init(&thr_attr);
+  pthread_attr_setdetachstate(&thr_attr,PTHREAD_CREATE_DETACHED);  /* workers are never joined */
+
+#ifdef MY_ATOMIC_MODE_RWLOCKS
+#if defined(HPUX11) || defined(__POWERPC__) /* showed to be very slow (scheduler-related) */
+#define CYCLES 300
+#else
+#define CYCLES 3000
+#endif
+#else
+#define CYCLES 3000
+#endif
+#define THREADS 30
+
+  diag("N CPUs: %d, atomic ops: %s", my_getncpus(), MY_ATOMIC_MODE);
+
+  do_tests();  /* supplied by the test file that includes this template */
+
+  /*
+    workaround until we know why it crashes randomly on some machine
+    (BUG#22320).
+  */
+  sleep(2);
+  pthread_mutex_destroy(&mutex);
+  pthread_cond_destroy(&cond);
+  pthread_attr_destroy(&thr_attr);
+  my_end(0);
+  return exit_status();
+}
+
diff --git a/unittest/mytap/CMakeLists.txt b/unittest/mytap/CMakeLists.txt
new file mode 100644
index 00000000000..9875f46697d
--- /dev/null
+++ b/unittest/mytap/CMakeLists.txt
@@ -0,0 +1,21 @@
+# Copyright (C) 2007 MySQL AB
+# 
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; version 2 of the License.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+# mytap: TAP (Test Anything Protocol) support library for the unit tests.
+INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/include
+                    ${CMAKE_SOURCE_DIR}/zlib ${CMAKE_SOURCE_DIR}/sql
+                    ${CMAKE_SOURCE_DIR}/regex
+                    ${CMAKE_SOURCE_DIR}/extra/yassl/include)
+ADD_LIBRARY(mytap tap.c)
diff --git a/unittest/mytap/Makefile.am b/unittest/mytap/Makefile.am
index c02bcd3b49d..d36dc25d0b5 100644
--- a/unittest/mytap/Makefile.am
+++ b/unittest/mytap/Makefile.am
@@ -13,14 +13,14 @@
 # along with this program; if not, write to the Free Software
 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 
-AM_CPPFLAGS  = -I$(top_builddir)/include -I$(top_srcdir)/include -I$(srcdir)
+# Build-tree include dir first: generated headers live there in out-of-tree builds
+AM_CPPFLAGS	   = -I$(top_builddir)/include -I$(top_srcdir)/include
 
 noinst_LIBRARIES   = libmytap.a
 noinst_HEADERS     = tap.h
 
 libmytap_a_SOURCES = tap.c
 
-SUBDIRS            = . t
+EXTRA_DIST         = CMakeLists.txt 
 
-# Don't update the files from bitkeeper
-%::SCCS/s.%
+SUBDIRS            = . t
diff --git a/unittest/mytap/tap.c b/unittest/mytap/tap.c
index 4e053e3e745..5cdbfeb428c 100644
--- a/unittest/mytap/tap.c
+++ b/unittest/mytap/tap.c
@@ -19,7 +19,7 @@
 
 #include "tap.h"
 
-#include "my_config.h"
+#include "my_global.h"
 
 #include <stdlib.h>
 #include <stdarg.h>
@@ -27,6 +27,16 @@
 #include <string.h>
 #include <signal.h>
 
+/*
+  Visual Studio 2003 does not know vsnprintf but knows _vsnprintf.
+  We don't put this #define in config-win.h because we prefer
+  my_vsnprintf everywhere instead, except when linking with libmysys
+  is not desirable - the case here.
+*/
+#if defined(_MSC_VER) && ( _MSC_VER == 1310 )
+#define vsnprintf _vsnprintf
+#endif
+
 /**
    @defgroup MyTAP_Internal MyTAP Internals
 
@@ -150,8 +160,10 @@ static signal_entry install_signal[]= {
   { SIGILL,  handle_core_signal },
   { SIGABRT, handle_core_signal },
   { SIGFPE,  handle_core_signal },
-  { SIGSEGV, handle_core_signal },
-  { SIGBUS,  handle_core_signal }
+  { SIGSEGV, handle_core_signal }
+#ifdef SIGBUS
+  , { SIGBUS,  handle_core_signal }
+#endif
 #ifdef SIGXCPU
   , { SIGXCPU, handle_core_signal }
 #endif
@@ -166,13 +178,22 @@ static signal_entry install_signal[]= {
 #endif
 };
 
+int skip_big_tests= 1;
+
 void
 plan(int const count)
 {
+  char *config= getenv("MYTAP_CONFIG");
+  size_t i;
+
+  if (config)
+    skip_big_tests= strcmp(config, "big");
+
+  setvbuf(tapout, 0, _IONBF, 0);  /* provide output at once */
   /*
     Install signal handler
   */
-  size_t i;
+
   for (i= 0; i < sizeof(install_signal)/sizeof(*install_signal); ++i)
     signal(install_signal[i].signo, install_signal[i].handler);
 
diff --git a/unittest/mytap/tap.h b/unittest/mytap/tap.h
index 31ec47d1ef2..d8f617c88fb 100644
--- a/unittest/mytap/tap.h
+++ b/unittest/mytap/tap.h
@@ -62,6 +62,24 @@ extern "C" {
 #endif
 
 /**
+   Defines whether "big" tests should be skipped.
+
+   It is non-zero by default; plan() clears it only when the MYTAP_CONFIG
+   environment variable equals "big".  It is supposed to be used as
+
+   @code
+   if (skip_big_tests) {
+     skip(1, "Big test skipped");
+   } else {
+     ok(life_universe_and_everything() == 42, "The answer is CORRECT");
+   }
+   @endcode
+
+   @see SKIP_BIG_TESTS
+*/
+extern int skip_big_tests;
+
+/**
   @defgroup MyTAP_API MyTAP API
 
   MySQL support for performing unit tests according to TAP.
@@ -81,10 +99,15 @@ extern "C" {
    that generate a core, so if you want to override these signals, do
    it <em>after</em> you have called the plan() function.
 
-   @param count The planned number of tests to run. 
+   It will also set skip_big_tests variable if MYTAP_CONFIG environment
+   variable is defined.
+
+   @see skip_big_tests
+
+   @param count The planned number of tests to run.
 */
 
-void plan(int count);
+void plan(int const count);
 
 
 /**
@@ -103,7 +126,7 @@ void plan(int count);
                which case nothing is printed.
 */
 
-void ok(int pass, char const *fmt, ...)
+void ok(int const pass, char const *fmt, ...)
   __attribute__((format(printf,2,3)));
 
 
@@ -135,7 +158,7 @@ void ok(int pass, char const *fmt, ...)
    @param reason     A reason for skipping the tests
  */
 
-void skip(int how_many, char const *reason, ...)
+void skip(int how_many, char const *const reason, ...)
     __attribute__((format(printf,2,3)));
 
 
@@ -161,6 +184,24 @@ void skip(int how_many, char const *reason, ...)
 
 
 /**
+   Helper macro to skip a group of "big" tests. It is used in the following
+   manner:
+
+   @code
+   SKIP_BIG_TESTS(1)
+   {
+     ok(life_universe_and_everything() == 42, "The answer is CORRECT");
+   }
+   @endcode
+
+   @see skip_big_tests
+ */
+
+#define SKIP_BIG_TESTS(COUNT) \
+  if (skip_big_tests) skip((COUNT), "big test"); else
+
+
+/**
    Print a diagnostics message.
 
    @param fmt  Diagnostics message in printf() format.
diff --git a/unittest/unit.pl b/unittest/unit.pl
index 9d328985012..a1aab376fdf 100644
--- a/unittest/unit.pl
+++ b/unittest/unit.pl
@@ -14,8 +14,9 @@
 # along with this program; if not, write to the Free Software
 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 
-use Test::Harness qw(&runtests $verbose);
+use Test::Harness;
 use File::Find;
+use Getopt::Long;
 
 use strict;
 
@@ -31,10 +32,19 @@ unit - Run unit tests in directory
 
 =head1 SYNOPSIS
 
-  unit run
+  unit [--[no]big] [--[no]verbose] run [tests to run]
 
 =cut
 
+my $big= ($ENV{'MYTAP_CONFIG'} || '') eq 'big';  # unset variable means "not big"
+
+my $result = GetOptions (
+  "big!"        => \$big,                      # --big / --nobig
+  "verbose!"    => \$Test::Harness::verbose,   # --verbose / --noverbose
+) or die "unit: invalid command line options\n";
+
+$ENV{'MYTAP_CONFIG'} = $big ? 'big' : '';  # exported so the test programs can see it
+
 my $cmd = shift;
 
 if (defined $cmd && exists $dispatch{$cmd}) {
@@ -56,7 +66,7 @@ sub _find_test_files (@) {
     my @files;
     find sub { 
         $File::Find::prune = 1 if /^SCCS$/;
-        push(@files, $File::Find::name) if -x _ && /-t\z/;
+        push(@files, $File::Find::name) if -x _ && (/-t\z/ || /-t\.exe\z/);
     }, @dirs;
     return @files;
 }
@@ -92,7 +102,7 @@ sub run_cmd (@) {
     if (@files > 0) {
         # Removing the first './' from the file names
         foreach (@files) { s!^\./!! }
-        $ENV{'HARNESS_PERL_SWITCHES'} .= q" -e 'exec @ARGV'";
+        $ENV{'HARNESS_PERL_SWITCHES'} .= ' -e "exec @ARGV"';
         runtests @files;
     }
 }