Diffstat (limited to 'mysys')
52 files changed, 4673 insertions, 645 deletions
diff --git a/mysys/CMakeLists.txt b/mysys/CMakeLists.txt index 545278485d1..8552eae3974 100755..100644 --- a/mysys/CMakeLists.txt +++ b/mysys/CMakeLists.txt @@ -27,11 +27,11 @@ INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/zlib ${CMAKE_SOURCE_DIR}/include ${CMAKE SET(MYSYS_SOURCES array.c charset-def.c charset.c checksum.c default.c default_modify.c errors.c hash.c list.c md5.c mf_brkhant.c mf_cache.c mf_dirname.c mf_fn_ext.c - mf_format.c mf_getdate.c mf_iocache.c mf_iocache2.c mf_keycache.c + mf_format.c mf_getdate.c mf_iocache.c mf_iocache2.c mf_keycache.c my_safehash.c mf_keycaches.c mf_loadpath.c mf_pack.c mf_path.c mf_qsort.c mf_qsort2.c mf_radix.c mf_same.c mf_sort.c mf_soundex.c mf_strip.c mf_arr_appstr.c mf_tempdir.c mf_tempfile.c mf_unixpath.c mf_wcomp.c mf_wfile.c mulalloc.c my_access.c - my_aes.c my_alarm.c my_alloc.c my_append.c my_bit.c my_bitmap.c my_chsize.c + my_aes.c my_alarm.c my_alloc.c my_append.c my_bit.c my_bitmap.c my_chmod.c my_chsize.c my_clock.c my_compress.c my_conio.c my_copy.c my_crc32.c my_create.c my_delete.c my_div.c my_error.c my_file.c my_fopen.c my_fstream.c my_gethostbyname.c my_gethwaddr.c my_getopt.c my_getsystime.c my_getwd.c my_handler.c my_init.c @@ -41,7 +41,11 @@ SET(MYSYS_SOURCES array.c charset-def.c charset.c checksum.c default.c default_ my_static.c my_symlink.c my_symlink2.c my_sync.c my_thr_init.c my_wincond.c my_windac.c my_winthread.c my_write.c ptr_cmp.c queues.c stacktrace.c rijndael.c safemalloc.c sha1.c string.c thr_alarm.c thr_lock.c thr_mutex.c - thr_rwlock.c tree.c typelib.c my_vle.c base64.c my_memmem.c my_getpagesize.c) + thr_rwlock.c tree.c typelib.c my_vle.c base64.c my_memmem.c my_getpagesize.c + lf_alloc-pin.c lf_dynarray.c lf_hash.c + my_atomic.c my_getncpus.c my_rnd.c + my_uuid.c wqueue.c waiting_threads.c +) IF(NOT SOURCE_SUBLIBS) ADD_LIBRARY(mysys ${MYSYS_SOURCES}) diff --git a/mysys/Makefile.am b/mysys/Makefile.am index 3312c692c09..6efdd0d75e7 100644 --- a/mysys/Makefile.am +++ b/mysys/Makefile.am @@ -20,18 +20,21 @@ INCLUDES = @ZLIB_INCLUDES@ -I$(top_builddir)/include \ -I$(top_srcdir)/include -I$(srcdir) pkglib_LIBRARIES = libmysys.a LDADD = libmysys.a $(top_builddir)/strings/libmystrings.a $(top_builddir)/dbug/libdbug.a -noinst_HEADERS = mysys_priv.h my_static.h my_handler_errors.h +noinst_HEADERS = mysys_priv.h my_static.h my_handler_errors.h my_safehash.h libmysys_a_SOURCES = my_init.c my_getwd.c mf_getdate.c my_mmap.c \ mf_path.c mf_loadpath.c my_file.c \ my_open.c my_create.c my_dup.c my_seek.c my_read.c \ my_pread.c my_write.c my_getpagesize.c \ + my_safehash.c \ mf_keycache.c mf_keycaches.c my_crc32.c \ mf_iocache.c mf_iocache2.c mf_cache.c mf_tempfile.c \ mf_tempdir.c my_lock.c mf_brkhant.c my_alarm.c \ my_malloc.c my_realloc.c my_once.c mulalloc.c \ my_alloc.c safemalloc.c my_new.cc \ - my_vle.c my_atomic.c \ + my_vle.c my_atomic.c lf_hash.c \ + lf_dynarray.c lf_alloc-pin.c \ my_fopen.c my_fstream.c my_getsystime.c \ + my_rnd.c my_uuid.c \ my_error.c errors.c my_div.c my_messnc.c \ mf_format.c mf_same.c mf_dirname.c mf_fn_ext.c \ my_symlink.c my_symlink2.c \ @@ -42,7 +45,7 @@ libmysys_a_SOURCES = my_init.c my_getwd.c mf_getdate.c my_mmap.c \ tree.c trie.c list.c hash.c array.c string.c typelib.c \ my_copy.c my_append.c my_lib.c \ my_delete.c my_rename.c my_redel.c \ - my_chsize.c my_clock.c \ + my_chsize.c my_chmod.c my_clock.c \ my_quick.c my_lockmem.c my_static.c \ my_sync.c my_getopt.c my_mkdir.c \ default_modify.c default.c \ @@ -52,12 +55,14 @@ libmysys_a_SOURCES = my_init.c my_getwd.c mf_getdate.c my_mmap.c \ 
my_gethostbyname.c rijndael.c my_aes.c sha1.c \ my_handler.c my_netware.c my_largepage.c \ my_memmem.c stacktrace.c \ - my_windac.c my_access.c base64.c my_libwrap.c -EXTRA_DIST = thr_alarm.c thr_lock.c my_pthread.c my_thr_init.c \ - thr_mutex.c thr_rwlock.c \ - CMakeLists.txt mf_soundex.c \ + my_windac.c my_access.c base64.c my_libwrap.c \ + wqueue.c +if THREAD +libmysys_a_SOURCES+= thr_alarm.c thr_lock.c my_pthread.c my_thr_init.c \ + thr_mutex.c thr_rwlock.c waiting_threads.c +endif +EXTRA_DIST = CMakeLists.txt mf_soundex.c \ my_conio.c my_wincond.c my_winthread.c -libmysys_a_LIBADD = @THREAD_LOBJECTS@ # test_dir_DEPENDENCIES= $(LIBRARIES) # testhash_DEPENDENCIES= $(LIBRARIES) # test_charset_DEPENDENCIES= $(LIBRARIES) @@ -71,11 +76,16 @@ DEFS = -DDEFAULT_BASEDIR=\"$(prefix)\" \ -DDEFAULT_SYSCONFDIR="\"$(sysconfdir)\"" \ @DEFS@ -libmysys_a_DEPENDENCIES= @THREAD_LOBJECTS@ - # I hope this always does the right thing. Otherwise this is only test programs FLAGS=$(DEFS) $(INCLUDES) $(CPPFLAGS) $(CFLAGS) @NOINST_LDFLAGS@ +CLEANFILES = test_bitmap$(EXEEXT) test_priority_queue$(EXEEXT) \ + test_thr_alarm$(EXEEXT) test_thr_lock$(EXEEXT) \ + test_vsnprintf$(EXEEXT) test_io_cache$(EXEEXT) \ + test_dir$(EXEEXT) test_charset$(EXEEXT) \ + testhash$(EXEEXT) test_gethwaddr$(EXEEXT) \ + test_base64$(EXEEXT) test_thr_mutex$(EXEEXT) + # # The CP .. RM stuff is to avoid problems with some compilers (like alpha ccc) # which automatically removes the object files you use to compile a final program @@ -126,5 +136,9 @@ test_base64$(EXEEXT): base64.c $(LIBRARIES) $(LINK) $(FLAGS) -DMAIN ./test_base64.c $(LDADD) $(LIBS) $(RM) -f ./test_base64.c +test_thr_mutex$(EXEEXT): test_thr_mutex.c $(LIBRARIES) + $(LINK) $(FLAGS) $(srcdir)/test_thr_mutex.c $(LDADD) $(LIBS) + + # Don't update the files from bitkeeper %::SCCS/s.% diff --git a/mysys/array.c b/mysys/array.c index 92940717c90..62d6b1ed4e9 100644 --- a/mysys/array.c +++ b/mysys/array.c @@ -30,8 +30,8 @@ alloc_increment Increment for adding new elements DESCRIPTION - init_dynamic_array() initiates array and allocate space for - init_alloc eilements. + init_dynamic_array() initiates the array and allocates space for + init_alloc elements. Array is usable even if space allocation failed. Static buffers must begin immediately after the array structure.
@@ -41,7 +41,7 @@ */ my_bool init_dynamic_array2(DYNAMIC_ARRAY *array, uint element_size, - void *init_buffer, uint init_alloc, + void *init_buffer, uint init_alloc, uint alloc_increment CALLER_INFO_PROTO) { DBUG_ENTER("init_dynamic_array"); @@ -51,33 +51,28 @@ my_bool init_dynamic_array2(DYNAMIC_ARRAY *array, uint element_size, if (init_alloc > 8 && alloc_increment > init_alloc * 2) alloc_increment=init_alloc*2; } - - if (!init_alloc) - { - init_alloc=alloc_increment; - init_buffer= 0; - } array->elements=0; array->max_element=init_alloc; array->alloc_increment=alloc_increment; array->size_of_element=element_size; if ((array->buffer= init_buffer)) DBUG_RETURN(FALSE); - if (!(array->buffer=(uchar*) my_malloc_ci(element_size*init_alloc, + if (init_alloc && + !(array->buffer=(uchar*) my_malloc_ci(element_size*init_alloc, MYF(MY_WME)))) { array->max_element=0; DBUG_RETURN(TRUE); } DBUG_RETURN(FALSE); -} +} my_bool init_dynamic_array(DYNAMIC_ARRAY *array, uint element_size, - uint init_alloc, + uint init_alloc, uint alloc_increment CALLER_INFO_PROTO) { /* placeholder to preserve ABI */ - return my_init_dynamic_array_ci(array, element_size, init_alloc, + return my_init_dynamic_array_ci(array, element_size, init_alloc, alloc_increment); } /* @@ -93,7 +88,7 @@ my_bool init_dynamic_array(DYNAMIC_ARRAY *array, uint element_size, FALSE Ok */ -my_bool insert_dynamic(DYNAMIC_ARRAY *array, uchar* element) +my_bool insert_dynamic(DYNAMIC_ARRAY *array, const uchar* element) { uchar* buffer; if (array->elements == array->max_element) @@ -112,7 +107,7 @@ my_bool insert_dynamic(DYNAMIC_ARRAY *array, uchar* element) /* - Alloc space for next element(s) + Alloc space for next element(s) SYNOPSIS alloc_dynamic() @@ -130,6 +125,7 @@ my_bool insert_dynamic(DYNAMIC_ARRAY *array, uchar* element) uchar *alloc_dynamic(DYNAMIC_ARRAY *array) { + DBUG_ENTER("alloc_dynamic"); if (array->elements == array->max_element) { char *new_ptr; @@ -143,20 +139,20 @@ uchar *alloc_dynamic(DYNAMIC_ARRAY *array) array->alloc_increment) * array->size_of_element, MYF(MY_WME)))) - return 0; - memcpy(new_ptr, array->buffer, + DBUG_RETURN(0); + memcpy(new_ptr, array->buffer, array->elements * array->size_of_element); } - else - if (!(new_ptr=(char*) my_realloc(array->buffer,(array->max_element+ - array->alloc_increment)* - array->size_of_element, - MYF(MY_WME | MY_ALLOW_ZERO_PTR)))) - return 0; + else if (!(new_ptr=(char*) + my_realloc(array->buffer,(array->max_element+ + array->alloc_increment)* + array->size_of_element, + MYF(MY_WME | MY_ALLOW_ZERO_PTR)))) + DBUG_RETURN(0); array->buffer= (uchar*) new_ptr; array->max_element+=array->alloc_increment; } - return array->buffer+(array->elements++ * array->size_of_element); + DBUG_RETURN(array->buffer+(array->elements++ * array->size_of_element)); } @@ -166,8 +162,8 @@ uchar *alloc_dynamic(DYNAMIC_ARRAY *array) SYNOPSIS pop_dynamic() array - - RETURN VALUE + + RETURN VALUE pointer Ok 0 Array is empty */ @@ -189,9 +185,9 @@ uchar *pop_dynamic(DYNAMIC_ARRAY *array) idx Index where element is to be inserted DESCRIPTION - set_dynamic() replaces element in array. - If idx > max_element insert new element. Allocate memory if needed. - + set_dynamic() replaces element in array. + If idx > max_element insert new element. Allocate memory if needed. 
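Taken together, the array.c hunks above keep the DYNAMIC_ARRAY calling conventions intact. For orientation, a minimal usage sketch of that API (assuming the usual my_init_dynamic_array() convenience wrapper from my_sys.h; error handling is trimmed and the demo function name is illustrative):

  static void dynamic_array_demo(void)
  {
    DYNAMIC_ARRAY arr;
    uint i, val, out;

    /* uint-sized elements; preallocate 4, grow by 8 at a time */
    if (my_init_dynamic_array(&arr, sizeof(uint), 4, 8))
      return;                                   /* allocation failed */
    for (i= 0; i < 100; i++)
    {
      val= i * i;
      if (insert_dynamic(&arr, (uchar*) &val))  /* grows as needed */
        break;
    }
    get_dynamic(&arr, (uchar*) &out, 7);        /* out == 49 */
    delete_dynamic(&arr);                       /* frees the buffer */
  }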
+ RETURN VALUE TRUE Idx was out of range and allocation of new memory failed FALSE Ok @@ -231,6 +227,8 @@ my_bool set_dynamic(DYNAMIC_ARRAY *array, uchar* element, uint idx) my_bool allocate_dynamic(DYNAMIC_ARRAY *array, uint max_elements) { + DBUG_ENTER("allocate_dynamic"); + if (max_elements >= array->max_element) { uint size; @@ -244,23 +242,20 @@ my_bool allocate_dynamic(DYNAMIC_ARRAY *array, uint max_elements) so we have to create an all-new malloc since we overflowed */ if (!(new_ptr= (uchar *) my_malloc(size * - array->size_of_element, - MYF(MY_WME)))) - return 0; - memcpy(new_ptr, array->buffer, + array->size_of_element, + MYF(MY_WME)))) + DBUG_RETURN(0); + memcpy(new_ptr, array->buffer, array->elements * array->size_of_element); } - else - - - if (!(new_ptr= (uchar*) my_realloc(array->buffer,size* - array->size_of_element, - MYF(MY_WME | MY_ALLOW_ZERO_PTR)))) - return TRUE; + else if (!(new_ptr= (uchar*) my_realloc(array->buffer,size* + array->size_of_element, + MYF(MY_WME | MY_ALLOW_ZERO_PTR)))) + DBUG_RETURN(TRUE); array->buffer= new_ptr; array->max_element= size; } - return FALSE; + DBUG_RETURN(FALSE); } @@ -269,9 +264,9 @@ my_bool allocate_dynamic(DYNAMIC_ARRAY *array, uint max_elements) SYNOPSIS get_dynamic() - array + array uchar* Element to be returned. If idx > elements contain zeroes. - idx Index of element wanted. + idx Index of element wanted. */ void get_dynamic(DYNAMIC_ARRAY *array, uchar* element, uint idx) @@ -348,7 +343,7 @@ void freeze_size(DYNAMIC_ARRAY *array) */ if (array->buffer == (uchar *)(array + 1)) return; - + if (array->buffer && array->max_element != elements) { array->buffer=(uchar*) my_realloc(array->buffer, @@ -365,7 +360,7 @@ void freeze_size(DYNAMIC_ARRAY *array) SYNOPSIS get_index_dynamic() array Array - element Whose element index + element Whose element index */ diff --git a/mysys/checksum.c b/mysys/checksum.c index 1c7c9358d53..1d264b54321 100644 --- a/mysys/checksum.c +++ b/mysys/checksum.c @@ -18,6 +18,8 @@ #include <my_sys.h> #include <zlib.h> +ha_checksum my_crc_dbug_check= 1; /* Unlikely number */ + /* Calculate a long checksum for a memoryblock. 
@@ -30,13 +32,9 @@ ha_checksum my_checksum(ha_checksum crc, const uchar *pos, size_t length) { -#ifdef NOT_USED - const uchar *end=pos+length; - for ( ; pos != end ; pos++) - crc=((crc << 8) + *((uchar*) pos)) + (crc >> (8*sizeof(ha_checksum)-8)); + crc= (ha_checksum) crc32((uint)crc, pos, length); + DBUG_PRINT("info", ("crc: %lu", (ulong) crc)); + if (crc == my_crc_dbug_check) + my_debug_put_break_here(); return crc; -#else - return (ha_checksum)crc32((uint)crc, pos, (uint)length); -#endif } - diff --git a/mysys/errors.c b/mysys/errors.c index 8d3303cac9f..d832ba37da3 100644 --- a/mysys/errors.c +++ b/mysys/errors.c @@ -49,7 +49,8 @@ const char * NEAR globerrs[GLOBERRS]= "Can't sync file '%s' to disk (Errcode: %d)", "Collation '%s' is not a compiled collation and is not specified in the '%s' file", "File '%s' not found (Errcode: %d)", - "File '%s' (fileno: %d) was not closed" + "File '%s' (fileno: %d) was not closed", + "Can't change mode for file '%s' to 0x%lx (Error: %d)" }; void init_glob_errs(void) @@ -90,6 +91,7 @@ void init_glob_errs() EE(EE_UNKNOWN_COLLATION)= "Collation '%s' is not a compiled collation and is not specified in the %s file"; EE(EE_FILENOTFOUND) = "File '%s' not found (Errcode: %d)"; EE(EE_FILE_NOT_CLOSED) = "File '%s' (fileno: %d) was not closed"; + EE(EE_CANT_CHMOD) = "Can't change mode for file '%s' to 0x%lx (Error: %d)"; } #endif diff --git a/mysys/hash.c b/mysys/hash.c index e7b5352af34..5443dedf7e0 100644 --- a/mysys/hash.c +++ b/mysys/hash.c @@ -304,7 +304,13 @@ static int hashcmp(const HASH *hash, HASH_LINK *pos, const uchar *key, } - /* Write a hash-key to the hash-index */ +/** + Write a hash-key to the hash-index + + @return + @retval 0 ok + @retval 1 Duplicate key or out of memory +*/ my_bool my_hash_insert(HASH *info, const uchar *record) { @@ -318,7 +324,7 @@ my_bool my_hash_insert(HASH *info, const uchar *record) LINT_INIT(ptr_to_rec); LINT_INIT(ptr_to_rec2); - if (HASH_UNIQUE & info->flags) + if (info->flags & HASH_UNIQUE) { uchar *key= (uchar*) my_hash_key(info, record, &idx, 1); if (my_hash_search(info, key, idx)) @@ -442,11 +448,21 @@ my_bool my_hash_insert(HASH *info, const uchar *record) } -/****************************************************************************** -** Remove one record from hash-table. The record with the same record -** ptr is removed. -** if there is a free-function it's called for record if found -******************************************************************************/ +/** + Remove one record from hash-table. + + @fn hash_delete() + @param hash Hash tree + @param record Row to be deleted + + @notes + The record with the same record ptr is removed. + If there is a free-function it's called if record was found. + + @return + @retval 0 ok + @retval 1 Record not found +*/ my_bool my_hash_delete(HASH *hash, uchar *record) { @@ -530,10 +546,11 @@ exit: DBUG_RETURN(0); }
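The my_hash_insert()/my_hash_delete() contracts documented above combine with my_hash_search() in the usual mysys round trip. A sketch (hash_init()/hash_free() are used with the same signatures that appear in the mf_keycaches.c hunk later in this diff; ENTRY and entry_get_key() are illustrative, and error handling is trimmed):

  typedef struct { char name[32]; uint32 value; } ENTRY;

  static uchar *entry_get_key(const uchar *rec, size_t *length,
                              my_bool first __attribute__((unused)))
  {
    *length= strlen(((ENTRY*) rec)->name);
    return (uchar*) ((ENTRY*) rec)->name;
  }

  static void hash_demo(void)
  {
    HASH h;
    ENTRY e= { "alpha", 1 };
    if (hash_init(&h, &my_charset_bin, 16, 0, 0,
                  (hash_get_key) entry_get_key, 0, 0))
      return;                                     /* out of memory */
    if (!my_hash_insert(&h, (uchar*) &e))         /* 0 == inserted */
    {
      ENTRY *found= (ENTRY*) my_hash_search(&h, (uchar*) "alpha", 5);
      if (found)
        my_hash_delete(&h, (uchar*) found);       /* 0 == deleted */
    }
    hash_free(&h);
  }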
- /* - Update keys when record has changed. - This is much more efficent than using a delete & insert. - */ + +/** + Update keys when record has changed. + This is much more efficient than using a delete & insert. +*/ my_bool my_hash_update(HASH *hash, uchar *record, uchar *old_key, size_t old_key_length) @@ -656,6 +673,37 @@ void my_hash_replace(HASH *hash, HASH_SEARCH_STATE *current_record, } +/** + Iterate over all elements in hash and call function with the element + + @param hash hash array + @param action function to call for each argument + @param argument second argument for call to action + + @notes + If one of the function calls returns 1 then the iteration aborts + + @retval 0 ok + @retval 1 iteration aborted because action returned 1 +*/ + +my_bool my_hash_iterate(HASH *hash, my_hash_walk_action action, void *argument) +{ + uint records, i; + HASH_LINK *data; + + records= hash->records; + data= dynamic_element(&hash->array,0,HASH_LINK*); + + for (i= 0 ; i < records ; i++) + { + if ((*action)(data[i].data, argument)) + return 1; + } + return 0; +} + + #ifndef DBUG_OFF my_bool my_hash_check(HASH *hash) diff --git a/mysys/lf_alloc-pin.c b/mysys/lf_alloc-pin.c new file mode 100644 index 00000000000..0293bfc6faf --- /dev/null +++ b/mysys/lf_alloc-pin.c @@ -0,0 +1,535 @@ +/* QQ: TODO multi-pinbox */ +/* Copyright (C) 2006-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + wait-free concurrent allocator based on pinning addresses + + It works as follows: every thread (strictly speaking - every CPU, but + it's too difficult to do) has a small array of pointers. They're called + "pins". Before using an object its address must be stored in this array + (pinned). When an object is no longer necessary its address must be + removed from this array (unpinned). When a thread wants to free() an + object it scans all pins of all threads to see if somebody has this + object pinned. If yes - the object is not freed (but stored in a + "purgatory"). To reduce the cost of a single free() pins are not scanned + on every free() but only added to (thread-local) purgatory. On every + LF_PURGATORY_SIZE free() purgatory is scanned and all unpinned objects + are freed. + + Pins are used to solve the ABA problem. To use pins one must obey + a pinning protocol: + + 1. Let's assume that PTR is a shared pointer to an object. Shared means + that any thread may modify it anytime to point to a different object + and free the old object. Later the freed object may be potentially + allocated by another thread. If we're unlucky that other thread may + set PTR to point to this object again. This is the ABA problem. + 2. Create a local pointer LOCAL_PTR. + 3. Pin the PTR in a loop: + do + { + LOCAL_PTR= PTR; + pin(PTR, PIN_NUMBER); + } while (LOCAL_PTR != PTR) + 4. It is guaranteed that after the loop has ended, LOCAL_PTR + points to an object (or NULL, if PTR may be NULL), that + will never be freed. It is not guaranteed though + that LOCAL_PTR == PTR (as PTR can change any time)
+ 5. When done working with the object, remove the pin: + unpin(PIN_NUMBER) + 6. When copying pins (as in the list traversing loop: + pin(CUR, 1); + while () + { + do // standard + { // pinning + NEXT=CUR->next; // loop + pin(NEXT, 0); // see #3 + } while (NEXT != CUR->next); // above + ... + ... + CUR=NEXT; + pin(CUR, 1); // copy pin[0] to pin[1] + } + which keeps CUR address constantly pinned), note that pins may be + copied only upwards (!!!), that is pin[N] to pin[M], M > N. + 7. Don't keep the object pinned longer than necessary - the number of + pins you have is limited (and small), keeping an object pinned + prevents its reuse and causes unnecessary mallocs. + + Explanations: + + 3. The loop is important. The following can occur: + thread1> LOCAL_PTR= PTR + thread2> free(PTR); PTR=0; + thread1> pin(PTR, PIN_NUMBER); + now thread1 cannot access LOCAL_PTR, even if it's pinned, + because it points to freed memory. That is, it *must* + verify that it has indeed pinned PTR, the shared pointer. + + 6. When a thread wants to free some LOCAL_PTR, and it scans + all lists of pins to see whether it's pinned, it does it + upwards, from low pin numbers to high. Thus another thread + must copy an address from one pin to another in the same + direction - upwards, otherwise the scanning thread may + miss it. + + Implementation details: + + Pins are given away from a "pinbox". Pinbox is a stack-based allocator. + It uses a dynarray for storing pins, new elements are allocated by dynarray + as necessary, old are pushed in the stack for reuse. ABA is solved by + versioning a pointer - because we use an array, a pointer to pins is 16 bits, + the upper 16 bits are used for a version. + + It is assumed that pins belong to a THD and are not transferable + between THD's (LF_PINS::stack_ends_here being a primary reason + for this limitation). +*/ +#include <my_global.h> +#include <my_sys.h> +#include <lf.h> + +#define LF_PINBOX_MAX_PINS 65536 + +static void _lf_pinbox_real_free(LF_PINS *pins); + +/* + Initialize a pinbox. Normally called from lf_alloc_init. + See the latter for details. +*/ +void lf_pinbox_init(LF_PINBOX *pinbox, uint free_ptr_offset, + lf_pinbox_free_func *free_func, void *free_func_arg) +{ + DBUG_ASSERT(free_ptr_offset % sizeof(void *) == 0); + compile_time_assert(sizeof(LF_PINS) == 128); + lf_dynarray_init(&pinbox->pinarray, sizeof(LF_PINS)); + pinbox->pinstack_top_ver= 0; + pinbox->pins_in_array= 0; + pinbox->free_ptr_offset= free_ptr_offset; + pinbox->free_func= free_func; + pinbox->free_func_arg= free_func_arg; +} + +void lf_pinbox_destroy(LF_PINBOX *pinbox) +{ + lf_dynarray_destroy(&pinbox->pinarray); +} + +/* + Get pins from a pinbox. Usually called via lf_alloc_get_pins() or + lf_hash_get_pins(). + + SYNOPSIS + pinbox - + + DESCRIPTION + get a new LF_PINS structure from a stack of unused pins, + or allocate a new one out of dynarray. + + NOTE + It is assumed that pins belong to a thread and are not transferable + between threads. +*/ +LF_PINS *_lf_pinbox_get_pins(LF_PINBOX *pinbox) +{ + uint32 pins, next, top_ver; + LF_PINS *el; + /* + We have an array of max. 64k elements. + The highest index currently allocated is pinbox->pins_in_array. + Freed elements are in a lifo stack, pinstack_top_ver. + pinstack_top_ver is 32 bits; 16 low bits are the index in the + array, to the first element of the list. 16 high bits are a version + (every time the 16 low bits are updated, the 16 high bits are + incremented). Versioning prevents the ABA problem. + */
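Steps 2-4 of the pinning protocol above can be packaged as a small helper. A sketch (lf_pin()/lf_unpin() are assumed to be the lf.h wrappers over the _lf_pin()/_lf_unpin() calls used elsewhere in this file; the helper itself is illustrative):

  /* pin *shared into pin slot 'pin_no' so it cannot be freed under us */
  static void *pin_shared(void * volatile *shared, LF_PINS *pins, int pin_no)
  {
    void *local;
    do
    {
      local= *shared;               /* take a snapshot                  */
      lf_pin(pins, pin_no, local);  /* publish it in our pin array      */
    } while (local != *shared);     /* it may have been freed meanwhile */
    /* 'local' is safe to dereference until lf_unpin(pins, pin_no) */
    return local;
  }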
+ top_ver= pinbox->pinstack_top_ver; + do + { + if (!(pins= top_ver % LF_PINBOX_MAX_PINS)) + { + /* the stack of free elements is empty */ + pins= my_atomic_add32((int32 volatile*) &pinbox->pins_in_array, 1)+1; + if (unlikely(pins >= LF_PINBOX_MAX_PINS)) + return 0; + /* + note that the first allocated element has index 1 (pins==1). + index 0 is reserved to mean "NULL pointer" + */ + el= (LF_PINS *)_lf_dynarray_lvalue(&pinbox->pinarray, pins); + if (unlikely(!el)) + return 0; + break; + } + el= (LF_PINS *)_lf_dynarray_value(&pinbox->pinarray, pins); + next= el->link; + } while (!my_atomic_cas32((int32 volatile*) &pinbox->pinstack_top_ver, + (int32*) &top_ver, + top_ver-pins+next+LF_PINBOX_MAX_PINS)); + /* + set el->link to the index of el in the dynarray (el->link has two usages: + - if element is allocated, it's its own index + - if element is free, it's its next element in the free stack + */ + el->link= pins; + el->purgatory_count= 0; + el->pinbox= pinbox; + el->stack_ends_here= & my_thread_var->stack_ends_here; + return el; +} + +/* + Put pins back to a pinbox. Usually called via lf_alloc_put_pins() or + lf_hash_put_pins(). + + DESCRIPTION + empty the purgatory (XXX deadlock warning below!), + push LF_PINS structure to a stack +*/ +void _lf_pinbox_put_pins(LF_PINS *pins) +{ + LF_PINBOX *pinbox= pins->pinbox; + uint32 top_ver, nr; + nr= pins->link; +#ifdef MY_LF_EXTRA_DEBUG + { + int i; + for (i= 0; i < LF_PINBOX_PINS; i++) + DBUG_ASSERT(pins->pin[i] == 0); + } +#endif + /* + XXX this will deadlock if other threads wait for + the caller to do something after _lf_pinbox_put_pins(), + and they would have pinned addresses that the caller wants to free. + Thus: only free pins when all work is done and nobody can wait for you!!! + */ + while (pins->purgatory_count) + { + _lf_pinbox_real_free(pins); + if (pins->purgatory_count) + { + my_atomic_rwlock_wrunlock(&pins->pinbox->pinarray.lock); + pthread_yield(); + my_atomic_rwlock_wrlock(&pins->pinbox->pinarray.lock); + } + } + top_ver= pinbox->pinstack_top_ver; + do + { + pins->link= top_ver % LF_PINBOX_MAX_PINS; + } while (!my_atomic_cas32((int32 volatile*) &pinbox->pinstack_top_ver, + (int32*) &top_ver, + top_ver-pins->link+nr+LF_PINBOX_MAX_PINS)); + return; +} + +static int ptr_cmp(void **a, void **b) +{ + return *a < *b ? -1 : *a == *b ? 0 : 1; +} + +#define add_to_purgatory(PINS, ADDR) \ + do \ + { \ + *(void **)((char *)(ADDR)+(PINS)->pinbox->free_ptr_offset)= \ + (PINS)->purgatory; \ + (PINS)->purgatory= (ADDR); \ + (PINS)->purgatory_count++; \ + } while (0) + +/* + Free an object allocated via pinbox allocator + + DESCRIPTION + add an object to purgatory. If necessary, call _lf_pinbox_real_free() + to actually free something. +*/ +void _lf_pinbox_free(LF_PINS *pins, void *addr) +{ + add_to_purgatory(pins, addr); + if (pins->purgatory_count % LF_PURGATORY_SIZE) + _lf_pinbox_real_free(pins); +} + +struct st_harvester { + void **granary; + int npins; +}; + +/* + callback for _lf_dynarray_iterate: + scan all pins of all threads and accumulate all pins +*/ +static int harvest_pins(LF_PINS *el, struct st_harvester *hv) +{ + int i; + LF_PINS *el_end= el+min(hv->npins, LF_DYNARRAY_LEVEL_LENGTH); + for (; el < el_end; el++) + { + for (i= 0; i < LF_PINBOX_PINS; i++) + { + void *p= el->pin[i]; + if (p) + *hv->granary++= p; + } + } + /* + hv->npins may become negative below, but it means that + we're on the last dynarray page and harvest_pins() won't be + called again. We don't bother to make hv->npins correct + (that is 0) in this case.
+ */ + hv->npins-= LF_DYNARRAY_LEVEL_LENGTH; + return 0; +} + +/* + callback for _lf_dynarray_iterate: + scan all pins of all threads and see if addr is present there +*/ +static int match_pins(LF_PINS *el, void *addr) +{ + int i; + LF_PINS *el_end= el+LF_DYNARRAY_LEVEL_LENGTH; + for (; el < el_end; el++) + for (i= 0; i < LF_PINBOX_PINS; i++) + if (el->pin[i] == addr) + return 1; + return 0; +} + +#if STACK_DIRECTION < 0 +#define available_stack_size(CUR,END) (long) ((char*)(CUR) - (char*)(END)) +#else +#define available_stack_size(CUR,END) (long) ((char*)(END) - (char*)(CUR)) +#endif + +#define next_node(P, X) (*((uchar * volatile *)(((uchar *)(X)) + (P)->free_ptr_offset))) +#define anext_node(X) next_node(&allocator->pinbox, (X)) + +/* + Scan the purgatory and free everything that can be freed +*/ +static void _lf_pinbox_real_free(LF_PINS *pins) +{ + int npins, alloca_size; + void *list, **addr; + void *first, *last= NULL; + LF_PINBOX *pinbox= pins->pinbox; + + LINT_INIT(first); + npins= pinbox->pins_in_array+1; + +#ifdef HAVE_ALLOCA + alloca_size= sizeof(void *)*LF_PINBOX_PINS*npins; + /* create a sorted list of pinned addresses, to speed up searches */ + if (available_stack_size(&pinbox, *pins->stack_ends_here) > alloca_size) + { + struct st_harvester hv; + addr= (void **) alloca(alloca_size); + hv.granary= addr; + hv.npins= npins; + /* scan the dynarray and accumulate all pinned addresses */ + _lf_dynarray_iterate(&pinbox->pinarray, + (lf_dynarray_func)harvest_pins, &hv); + + npins= hv.granary-addr; + /* and sort them */ + if (npins) + qsort(addr, npins, sizeof(void *), (qsort_cmp)ptr_cmp); + } + else +#endif + addr= 0; + + list= pins->purgatory; + pins->purgatory= 0; + pins->purgatory_count= 0; + while (list) + { + void *cur= list; + list= *(void **)((char *)cur+pinbox->free_ptr_offset); + if (npins) + { + if (addr) /* use binary search */ + { + void **a, **b, **c; + for (a= addr, b= addr+npins-1, c= a+(b-a)/2; (b-a) > 1; c= a+(b-a)/2) + if (cur == *c) + a= b= c; + else if (cur > *c) + a= c; + else + b= c; + if (cur == *a || cur == *b) + goto found; + } + else /* no alloca - no cookie. linear search here */ + { + if (_lf_dynarray_iterate(&pinbox->pinarray, + (lf_dynarray_func)match_pins, cur)) + goto found; + } + } + /* not pinned - freeing */ + if (last) + last= next_node(pinbox, last)= (uchar *)cur; + else + first= last= (uchar *)cur; + continue; +found: + /* pinned - keeping */ + add_to_purgatory(pins, cur); + } + if (last) + pinbox->free_func(first, last, pinbox->free_func_arg); +} + +/* lock-free memory allocator for fixed-size objects */ + +LF_REQUIRE_PINS(1) + +/* + callback for _lf_pinbox_real_free to free a list of unpinned objects - + add it back to the allocator stack + + DESCRIPTION + 'first' and 'last' are the ends of the linked list of nodes: + first->el->el->....->el->last. Use first==last to free only one element. +*/ +static void alloc_free(uchar *first, + uchar volatile *last, + LF_ALLOCATOR *allocator) +{ + /* + we need a union here to access type-punned pointer reliably. 
+ otherwise gcc -fstrict-aliasing will not see 'tmp' changed in the loop + */ + union { uchar * node; void *ptr; } tmp; + tmp.node= allocator->top; + do + { + anext_node(last)= tmp.node; + } while (!my_atomic_casptr((void **)(char *)&allocator->top, + (void **)&tmp.ptr, first) && LF_BACKOFF); +} + +/* + initialize lock-free allocator + + SYNOPSIS + allocator - + size the size of an object to allocate + free_ptr_offset an offset inside the object to a sizeof(void *) + memory that is guaranteed to be unused after + the object is put in the purgatory. Unused by ANY + thread, not only the purgatory owner. + This memory will be used to link waiting-to-be-freed + objects in a purgatory list. +*/ +void lf_alloc_init(LF_ALLOCATOR *allocator, uint size, uint free_ptr_offset) +{ + lf_pinbox_init(&allocator->pinbox, free_ptr_offset, + (lf_pinbox_free_func *)alloc_free, allocator); + allocator->top= 0; + allocator->mallocs= 0; + allocator->element_size= size; + allocator->constructor= 0; + allocator->destructor= 0; + DBUG_ASSERT(size >= sizeof(void*) + free_ptr_offset); +} + +/* + destroy the allocator, free everything that's in it + + NOTE + As every other init/destroy function here and elsewhere it + is not thread safe. No, this function is no different, ensure + that no thread needs the allocator before destroying it. + We are not responsible for any damage that may be caused by + accessing the allocator when it is being or has been destroyed. + Oh yes, and don't put your cat in a microwave. +*/ +void lf_alloc_destroy(LF_ALLOCATOR *allocator) +{ + uchar *node= allocator->top; + while (node) + { + uchar *tmp= anext_node(node); + if (allocator->destructor) + allocator->destructor(node); + my_free((void *)node, MYF(0)); + node= tmp; + } + lf_pinbox_destroy(&allocator->pinbox); + allocator->top= 0; +} + +/* + Allocate and return a new object. + + DESCRIPTION + Pop an unused object from the stack or malloc it if the stack is empty. + pin[0] is used, it's removed on return. +*/ +void *_lf_alloc_new(LF_PINS *pins) +{ + LF_ALLOCATOR *allocator= (LF_ALLOCATOR *)(pins->pinbox->free_func_arg); + uchar *node; + for (;;) + { + do + { + node= allocator->top; + _lf_pin(pins, 0, node); + } while (node != allocator->top && LF_BACKOFF); + if (!node) + { + node= (void *)my_malloc(allocator->element_size, MYF(MY_WME)); + if (allocator->constructor) + allocator->constructor(node); +#ifdef MY_LF_EXTRA_DEBUG + if (likely(node != 0)) + my_atomic_add32(&allocator->mallocs, 1); +#endif + break; + } + if (my_atomic_casptr((void **)(char *)&allocator->top, + (void *)&node, anext_node(node))) + break; + } + _lf_unpin(pins, 0); + return node; +} + +/* + count the number of objects in a pool. + + NOTE + This is NOT thread-safe !!! +*/ +uint lf_alloc_pool_count(LF_ALLOCATOR *allocator) +{ + uint i; + uchar *node; + for (node= allocator->top, i= 0; node; node= anext_node(node), i++) + /* no op */; + return i; +} +
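Taken together, the allocator entry points above are meant to be driven like this (a sketch: the lf_alloc_get_pins()/lf_alloc_new()/lf_alloc_free()/lf_alloc_put_pins() wrappers named in the comments above are assumed to live in lf.h, and NODE is illustrative):

  /* free_link is memory nobody touches once the node is discarded,
     which is exactly what free_ptr_offset must point at */
  typedef struct st_node { struct st_node *free_link; int payload; } NODE;

  LF_ALLOCATOR pool;
  lf_alloc_init(&pool, sizeof(NODE), offsetof(NODE, free_link));

  /* per thread: */
  LF_PINS *pins= lf_alloc_get_pins(&pool);
  NODE *n= (NODE *) lf_alloc_new(pins);
  if (n)
  {
    n->payload= 42;
    /* ... publish n, work with it ... */
    lf_alloc_free(pins, n);   /* parks n in the purgatory until unpinned */
  }
  lf_alloc_put_pins(pins);    /* only once nothing can wait on this thread */

  lf_alloc_destroy(&pool);    /* single-threaded shutdown */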
diff --git a/mysys/lf_dynarray.c b/mysys/lf_dynarray.c new file mode 100644 index 00000000000..7c8f54f07cf --- /dev/null +++ b/mysys/lf_dynarray.c @@ -0,0 +1,208 @@ +/* Copyright (C) 2006 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + Analog of DYNAMIC_ARRAY that never reallocs + (so no pointer into the array may ever become invalid). + + Memory is allocated in non-contiguous chunks. + This data structure is not space efficient for sparse arrays. + + Every element is aligned to sizeof(element) boundary + (to avoid false sharing if element is big enough). + + LF_DYNARRAY is a recursive structure. On the zero level + LF_DYNARRAY::level[0] it's an array of LF_DYNARRAY_LEVEL_LENGTH elements, + on the first level it's an array of LF_DYNARRAY_LEVEL_LENGTH pointers + to arrays of elements, on the second level it's an array of pointers + to arrays of pointers to arrays of elements. And so on. + + With four levels the number of elements is limited to 4311810304 + (but as in all functions index is uint, the real limit is 2^32-1) + + Actually, it's wait-free, not lock-free ;-) +*/ + +#include <my_global.h> +#include <m_string.h> +#include <my_sys.h> +#include <lf.h> + +void lf_dynarray_init(LF_DYNARRAY *array, uint element_size) +{ + bzero(array, sizeof(*array)); + array->size_of_element= element_size; + my_atomic_rwlock_init(&array->lock); +} + +static void recursive_free(void **alloc, int level) +{ + if (!alloc) + return; + + if (level) + { + int i; + for (i= 0; i < LF_DYNARRAY_LEVEL_LENGTH; i++) + recursive_free(alloc[i], level-1); + my_free((void *)alloc, MYF(0)); + } + else + my_free(alloc[-1], MYF(0)); +} + +void lf_dynarray_destroy(LF_DYNARRAY *array) +{ + int i; + for (i= 0; i < LF_DYNARRAY_LEVELS; i++) + recursive_free(array->level[i], i); + my_atomic_rwlock_destroy(&array->lock); +} + +static const ulong dynarray_idxes_in_prev_levels[LF_DYNARRAY_LEVELS]= +{ + 0, /* +1 here to avoid -1's below */ + LF_DYNARRAY_LEVEL_LENGTH, + LF_DYNARRAY_LEVEL_LENGTH * LF_DYNARRAY_LEVEL_LENGTH + + LF_DYNARRAY_LEVEL_LENGTH, + LF_DYNARRAY_LEVEL_LENGTH * LF_DYNARRAY_LEVEL_LENGTH * + LF_DYNARRAY_LEVEL_LENGTH + LF_DYNARRAY_LEVEL_LENGTH * + LF_DYNARRAY_LEVEL_LENGTH + LF_DYNARRAY_LEVEL_LENGTH +}; + +static const ulong dynarray_idxes_in_prev_level[LF_DYNARRAY_LEVELS]= +{ + 0, /* +1 here to avoid -1's below */ + LF_DYNARRAY_LEVEL_LENGTH, + LF_DYNARRAY_LEVEL_LENGTH * LF_DYNARRAY_LEVEL_LENGTH, + LF_DYNARRAY_LEVEL_LENGTH * LF_DYNARRAY_LEVEL_LENGTH * + LF_DYNARRAY_LEVEL_LENGTH, +}; + +/* + Returns a valid lvalue pointer to the element number 'idx'. + Allocates memory if necessary.
+*/ +void *_lf_dynarray_lvalue(LF_DYNARRAY *array, uint idx) +{ + void * ptr, * volatile * ptr_ptr= 0; + int i; + + for (i= LF_DYNARRAY_LEVELS-1; idx < dynarray_idxes_in_prev_levels[i]; i--) + /* no-op */; + ptr_ptr= &array->level[i]; + idx-= dynarray_idxes_in_prev_levels[i]; + for (; i > 0; i--) + { + if (!(ptr= *ptr_ptr)) + { + void *alloc= my_malloc(LF_DYNARRAY_LEVEL_LENGTH * sizeof(void *), + MYF(MY_WME|MY_ZEROFILL)); + if (unlikely(!alloc)) + return(NULL); + if (my_atomic_casptr(ptr_ptr, &ptr, alloc)) + ptr= alloc; + else + my_free(alloc, MYF(0)); + } + ptr_ptr= ((void **)ptr) + idx / dynarray_idxes_in_prev_level[i]; + idx%= dynarray_idxes_in_prev_level[i]; + } + if (!(ptr= *ptr_ptr)) + { + uchar *alloc, *data; + alloc= my_malloc(LF_DYNARRAY_LEVEL_LENGTH * array->size_of_element + + max(array->size_of_element, sizeof(void *)), + MYF(MY_WME|MY_ZEROFILL)); + if (unlikely(!alloc)) + return(NULL); + /* reserve the space for free() address */ + data= alloc + sizeof(void *); + { /* alignment */ + intptr mod= ((intptr)data) % array->size_of_element; + if (mod) + data+= array->size_of_element - mod; + } + ((void **)data)[-1]= alloc; /* free() will need the original pointer */ + if (my_atomic_casptr(ptr_ptr, &ptr, data)) + ptr= data; + else + my_free(alloc, MYF(0)); + } + return ((uchar*)ptr) + array->size_of_element * idx; +} + +/* + Returns a pointer to the element number 'idx' + or NULL if an element does not exist +*/ +void *_lf_dynarray_value(LF_DYNARRAY *array, uint idx) +{ + void * ptr, * volatile * ptr_ptr= 0; + int i; + + for (i= LF_DYNARRAY_LEVELS-1; idx < dynarray_idxes_in_prev_levels[i]; i--) + /* no-op */; + ptr_ptr= &array->level[i]; + idx-= dynarray_idxes_in_prev_levels[i]; + for (; i > 0; i--) + { + if (!(ptr= *ptr_ptr)) + return(NULL); + ptr_ptr= ((void **)ptr) + idx / dynarray_idxes_in_prev_level[i]; + idx %= dynarray_idxes_in_prev_level[i]; + } + if (!(ptr= *ptr_ptr)) + return(NULL); + return ((uchar*)ptr) + array->size_of_element * idx; +} + +static int recursive_iterate(LF_DYNARRAY *array, void *ptr, int level, + lf_dynarray_func func, void *arg) +{ + int res, i; + if (!ptr) + return 0; + if (!level) + return func(ptr, arg); + for (i= 0; i < LF_DYNARRAY_LEVEL_LENGTH; i++) + if ((res= recursive_iterate(array, ((void **)ptr)[i], level-1, func, arg))) + return res; + return 0; +} + +/* + Calls func(array, arg) on every array of LF_DYNARRAY_LEVEL_LENGTH elements + in lf_dynarray. + + DESCRIPTION + lf_dynarray consists of a set of arrays, LF_DYNARRAY_LEVEL_LENGTH elements + each. _lf_dynarray_iterate() calls user-supplied function on every array + from the set. It is the fastest way to scan the array, faster than + for (i=0; i < N; i++) { func(_lf_dynarray_value(dynarray, i)); } + + NOTE + if func() returns non-zero, the scan is aborted +*/ +int _lf_dynarray_iterate(LF_DYNARRAY *array, lf_dynarray_func func, void *arg) +{ + int i, res; + for (i= 0; i < LF_DYNARRAY_LEVELS; i++) + if ((res= recursive_iterate(array, array->level[i], i, func, arg))) + return res; + return 0; +} +
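The three dynarray entry points split the work as documented above: _lf_dynarray_lvalue() for writers (it allocates the path to a slot), _lf_dynarray_value() for readers, and _lf_dynarray_iterate() for scans. A single-threaded sketch:

  LF_DYNARRAY da;
  uint32 *slot, *probe;

  lf_dynarray_init(&da, sizeof(uint32));

  /* writer: index 200000 is fine, intermediate levels appear on demand */
  if ((slot= (uint32 *) _lf_dynarray_lvalue(&da, 200000)))
    *slot= 7;

  /* reader: never allocates; NULL means the slot was never written */
  probe= (uint32 *) _lf_dynarray_value(&da, 5);    /* probe == NULL */

  lf_dynarray_destroy(&da);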
diff --git a/mysys/lf_hash.c b/mysys/lf_hash.c new file mode 100644 index 00000000000..ce7056af995 --- /dev/null +++ b/mysys/lf_hash.c @@ -0,0 +1,506 @@ +/* Copyright (C) 2006-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + extensible hash + + TODO + try to get rid of dummy nodes ? + for non-unique hash, count only _distinct_ values + (but how to do it in lf_hash_delete ?) +*/ +#include <my_global.h> +#include <m_string.h> +#include <my_sys.h> +#include <my_bit.h> +#include <lf.h> + +LF_REQUIRE_PINS(3) + +/* An element of the list */ +typedef struct { + intptr volatile link; /* a pointer to the next element in a list and a flag */ + uint32 hashnr; /* reversed hash number, for sorting */ + const uchar *key; + size_t keylen; + /* + data is stored here, directly after the keylen. + thus the pointer to data is (void*)(slist_element_ptr+1) + */ +} LF_SLIST; + +const int LF_HASH_OVERHEAD= sizeof(LF_SLIST); + +/* + a structure to pass the context (pointers to the three successive elements + in a list) from lfind to linsert/ldelete +*/ +typedef struct { + intptr volatile *prev; + LF_SLIST *curr, *next; +} CURSOR; + +/* + the last bit in LF_SLIST::link is a "deleted" flag. + the helper macros below convert it to a pure pointer or a pure flag +*/ +#define PTR(V) (LF_SLIST *)((V) & (~(intptr)1)) +#define DELETED(V) ((V) & 1) + +/* + DESCRIPTION + Search for hashnr/key/keylen in the list starting from 'head' and + position the cursor. The list is ORDER BY hashnr, key + + RETURN + 0 - not found + 1 - found + + NOTE + cursor is positioned in either case + pins[0..2] are used, they are NOT removed on return +*/ +static int lfind(LF_SLIST * volatile *head, CHARSET_INFO *cs, uint32 hashnr, + const uchar *key, uint keylen, CURSOR *cursor, LF_PINS *pins) +{ + uint32 cur_hashnr; + const uchar *cur_key; + uint cur_keylen; + intptr link; + +retry: + cursor->prev= (intptr *)head; + do { /* PTR() isn't necessary below, head is a dummy node */ + cursor->curr= (LF_SLIST *)(*cursor->prev); + _lf_pin(pins, 1, cursor->curr); + } while (*cursor->prev != (intptr)cursor->curr && LF_BACKOFF); + for (;;) + { + if (unlikely(!cursor->curr)) + return 0; /* end of the list */ + do { + /* QQ: XXX or goto retry ?
*/ + link= cursor->curr->link; + cursor->next= PTR(link); + _lf_pin(pins, 0, cursor->next); + } while (link != cursor->curr->link && LF_BACKOFF); + cur_hashnr= cursor->curr->hashnr; + cur_key= cursor->curr->key; + cur_keylen= cursor->curr->keylen; + if (*cursor->prev != (intptr)cursor->curr) + { + (void)LF_BACKOFF; + goto retry; + } + if (!DELETED(link)) + { + if (cur_hashnr >= hashnr) + { + int r= 1; + if (cur_hashnr > hashnr || + (r= my_strnncoll(cs, (uchar*) cur_key, cur_keylen, (uchar*) key, + keylen)) >= 0) + return !r; + } + cursor->prev= &(cursor->curr->link); + _lf_pin(pins, 2, cursor->curr); + } + else + { + /* + we found a deleted node - be nice, help the other thread + and remove this deleted node + */ + if (my_atomic_casptr((void **)cursor->prev, + (void **)&cursor->curr, cursor->next)) + _lf_alloc_free(pins, cursor->curr); + else + { + (void)LF_BACKOFF; + goto retry; + } + } + cursor->curr= cursor->next; + _lf_pin(pins, 1, cursor->curr); + } +} + +/* + DESCRIPTION + insert a 'node' in the list that starts from 'head' in the correct + position (as found by lfind) + + RETURN + 0 - inserted + not 0 - a pointer to a duplicate (not pinned and thus unusable) + + NOTE + it uses pins[0..2], on return all pins are removed. + if there're nodes with the same key value, a new node is added before them. +*/ +static LF_SLIST *linsert(LF_SLIST * volatile *head, CHARSET_INFO *cs, + LF_SLIST *node, LF_PINS *pins, uint flags) +{ + CURSOR cursor; + int res; + + for (;;) + { + if (lfind(head, cs, node->hashnr, node->key, node->keylen, + &cursor, pins) && + (flags & LF_HASH_UNIQUE)) + { + res= 0; /* duplicate found */ + break; + } + else + { + node->link= (intptr)cursor.curr; + DBUG_ASSERT(node->link != (intptr)node); /* no circular references */ + DBUG_ASSERT(cursor.prev != &node->link); /* no circular references */ + if (my_atomic_casptr((void **)cursor.prev, (void **)&cursor.curr, node)) + { + res= 1; /* inserted ok */ + break; + } + } + } + _lf_unpin(pins, 0); + _lf_unpin(pins, 1); + _lf_unpin(pins, 2); + /* + Note that cursor.curr is not pinned here and the pointer is unreliable, + the object may disappear anytime. But if it points to a dummy node, the + pointer is safe, because dummy nodes are never freed - initialize_bucket() + uses this fact. + */ + return res ? 0 : cursor.curr; +} + +/* + DESCRIPTION + deletes a node as identified by hashnr/key/keylen from the list + that starts from 'head' + + RETURN + 0 - ok + 1 - not found + + NOTE + it uses pins[0..2], on return all pins are removed. +*/ +static int ldelete(LF_SLIST * volatile *head, CHARSET_INFO *cs, uint32 hashnr, + const uchar *key, uint keylen, LF_PINS *pins) +{ + CURSOR cursor; + int res; + + for (;;) + { + if (!lfind(head, cs, hashnr, key, keylen, &cursor, pins)) + { + res= 1; /* not found */ + break; + } + else + { + /* mark the node deleted */ + if (my_atomic_casptr((void **)&(cursor.curr->link), + (void **)&cursor.next, + (void *)(((intptr)cursor.next) | 1))) + { + /* and remove it from the list */ + if (my_atomic_casptr((void **)cursor.prev, + (void **)&cursor.curr, cursor.next)) + _lf_alloc_free(pins, cursor.curr); + else + { + /* + somebody already "helped" us and removed the node ? + Let's check if we need to help that someone too!
+ (to ensure the number of "set DELETED flag" actions + is equal to the number of "remove from the list" actions) + */ + lfind(head, cs, hashnr, key, keylen, &cursor, pins); + } + res= 0; + break; + } + } + } + _lf_unpin(pins, 0); + _lf_unpin(pins, 1); + _lf_unpin(pins, 2); + return res; +} + +/* + DESCRIPTION + searches for a node as identified by hashnr/key/keylen in the list + that starts from 'head' + + RETURN + 0 - not found + node - found + + NOTE + it uses pins[0..2], on return the pin[2] keeps the node found + all other pins are removed. +*/ +static LF_SLIST *lsearch(LF_SLIST * volatile *head, CHARSET_INFO *cs, + uint32 hashnr, const uchar *key, uint keylen, + LF_PINS *pins) +{ + CURSOR cursor; + int res= lfind(head, cs, hashnr, key, keylen, &cursor, pins); + if (res) + _lf_pin(pins, 2, cursor.curr); + _lf_unpin(pins, 0); + _lf_unpin(pins, 1); + return res ? cursor.curr : 0; +} + +static inline const uchar* hash_key(const LF_HASH *hash, + const uchar *record, size_t *length) +{ + if (hash->get_key) + return (*hash->get_key)(record, length, 0); + *length= hash->key_length; + return record + hash->key_offset; +} + +/* + Compute the hash key value from the raw key. + + @note that the hash value is limited to 2^31, because we need one + bit to distinguish between normal and dummy nodes. +*/ +static inline uint calc_hash(LF_HASH *hash, const uchar *key, uint keylen) +{ + ulong nr1= 1, nr2= 4; + hash->charset->coll->hash_sort(hash->charset, (uchar*) key, keylen, + &nr1, &nr2); + return nr1 & INT_MAX32; +} + +#define MAX_LOAD 1.0 /* average number of elements in a bucket */ + +static int initialize_bucket(LF_HASH *, LF_SLIST * volatile*, uint, LF_PINS *); + +/* + Initializes lf_hash, the arguments are compatible with hash_init + + @note element_size sets both the size of allocated memory block for + lf_alloc and a size of memcpy'ed block size in lf_hash_insert. Typically + they are the same, indeed. But LF_HASH::element_size can be decreased + after lf_hash_init, and then lf_alloc will allocate larger block that + lf_hash_insert will copy over. It is desirable if part of the element + is expensive to initialize - for example if there is a mutex or + DYNAMIC_ARRAY. In this case they should be initialized in the + LF_ALLOCATOR::constructor, and lf_hash_insert should not overwrite them. + See wt_init() for example. +*/ +void lf_hash_init(LF_HASH *hash, uint element_size, uint flags, + uint key_offset, uint key_length, hash_get_key get_key, + CHARSET_INFO *charset) +{ + lf_alloc_init(&hash->alloc, sizeof(LF_SLIST)+element_size, + offsetof(LF_SLIST, key)); + lf_dynarray_init(&hash->array, sizeof(LF_SLIST *)); + hash->size= 1; + hash->count= 0; + hash->element_size= element_size; + hash->flags= flags; + hash->charset= charset ? charset : &my_charset_bin; + hash->key_offset= key_offset; + hash->key_length= key_length; + hash->get_key= get_key; + DBUG_ASSERT(get_key ? !key_offset && !key_length : key_length); +} + +void lf_hash_destroy(LF_HASH *hash) +{ + LF_SLIST *el, **head= (LF_SLIST **)_lf_dynarray_value(&hash->array, 0); + + if (unlikely(!head)) + return; + el= *head; + + while (el) + { + intptr next= el->link; + if (el->hashnr & 1) + lf_alloc_direct_free(&hash->alloc, el); /* normal node */ + else + my_free((void *)el, MYF(0)); /* dummy node */ + el= (LF_SLIST *)next; + } + lf_alloc_destroy(&hash->alloc); + lf_dynarray_destroy(&hash->array); +} + +/* + DESCRIPTION + inserts a new element into the hash. It will have a _copy_ of + data, not a pointer to it.
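Before the insert/search/delete trio that follows, this is roughly how a thread drives an LF_HASH end to end (a sketch: lf_hash_get_pins()/lf_hash_put_pins() are the lf.h wrappers named earlier, USER is illustrative, error paths are trimmed; a successful lf_hash_search() leaves the element pinned in pin 2, hence the explicit _lf_unpin()):

  typedef struct { uint32 id; char name[16]; } USER;

  LF_HASH uhash;
  lf_hash_init(&uhash, sizeof(USER), LF_HASH_UNIQUE,
               offsetof(USER, id), sizeof(uint32), NULL, NULL);

  /* per thread: */
  LF_PINS *pins= lf_hash_get_pins(&uhash);
  USER u= { 42, "alice" };
  lf_hash_insert(&uhash, pins, &u);          /* copies element_size bytes */

  USER *p= (USER *) lf_hash_search(&uhash, pins, &u.id, sizeof(u.id));
  if (p && p != MY_ERRPTR)
  {
    /* ... read *p ... */
    _lf_unpin(pins, 2);                      /* release the search pin */
  }
  lf_hash_delete(&uhash, pins, &u.id, sizeof(u.id));
  lf_hash_put_pins(pins);

  lf_hash_destroy(&uhash);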
+ + RETURN + 0 - inserted + 1 - didn't (unique key conflict) + -1 - out of memory + + NOTE + see linsert() for pin usage notes +*/ +int lf_hash_insert(LF_HASH *hash, LF_PINS *pins, const void *data) +{ + int csize, bucket, hashnr; + LF_SLIST *node, * volatile *el; + + lf_rwlock_by_pins(pins); + node= (LF_SLIST *)_lf_alloc_new(pins); + if (unlikely(!node)) + return -1; + memcpy(node+1, data, hash->element_size); + node->key= hash_key(hash, (uchar *)(node+1), &node->keylen); + hashnr= calc_hash(hash, node->key, node->keylen); + bucket= hashnr % hash->size; + el= _lf_dynarray_lvalue(&hash->array, bucket); + if (unlikely(!el)) + return -1; + if (*el == NULL && unlikely(initialize_bucket(hash, el, bucket, pins))) + return -1; + node->hashnr= my_reverse_bits(hashnr) | 1; /* normal node */ + if (linsert(el, hash->charset, node, pins, hash->flags)) + { + _lf_alloc_free(pins, node); + lf_rwunlock_by_pins(pins); + return 1; + } + csize= hash->size; + if ((my_atomic_add32(&hash->count, 1)+1.0) / csize > MAX_LOAD) + my_atomic_cas32(&hash->size, &csize, csize*2); + lf_rwunlock_by_pins(pins); + return 0; +} + +/* + DESCRIPTION + deletes an element with the given key from the hash (if a hash is + not unique and there're many elements with this key - the "first" + matching element is deleted) + RETURN + 0 - deleted + 1 - didn't (not found) + -1 - out of memory + NOTE + see ldelete() for pin usage notes +*/ +int lf_hash_delete(LF_HASH *hash, LF_PINS *pins, const void *key, uint keylen) +{ + LF_SLIST * volatile *el; + uint bucket, hashnr= calc_hash(hash, (uchar *)key, keylen); + + bucket= hashnr % hash->size; + lf_rwlock_by_pins(pins); + el= _lf_dynarray_lvalue(&hash->array, bucket); + if (unlikely(!el)) + return -1; + /* + note that we still need to initialize_bucket here, + we cannot return "node not found", because an old bucket of that + node may've been split and the node was assigned to a new bucket + that was never accessed before and thus is not initialized. + */ + if (*el == NULL && unlikely(initialize_bucket(hash, el, bucket, pins))) + return -1; + if (ldelete(el, hash->charset, my_reverse_bits(hashnr) | 1, + (uchar *)key, keylen, pins)) + { + lf_rwunlock_by_pins(pins); + return 1; + } + my_atomic_add32(&hash->count, -1); + lf_rwunlock_by_pins(pins); + return 0; +} + +/* + RETURN + a pointer to an element with the given key (if a hash is not unique and + there're many elements with this key - the "first" matching element) + NULL if nothing is found + MY_ERRPTR if OOM + + NOTE + see lsearch() for pin usage notes +*/ +void *lf_hash_search(LF_HASH *hash, LF_PINS *pins, const void *key, uint keylen) +{ + LF_SLIST * volatile *el, *found; + uint bucket, hashnr= calc_hash(hash, (uchar *)key, keylen); + + bucket= hashnr % hash->size; + lf_rwlock_by_pins(pins); + el= _lf_dynarray_lvalue(&hash->array, bucket); + if (unlikely(!el)) + return MY_ERRPTR; + if (*el == NULL && unlikely(initialize_bucket(hash, el, bucket, pins))) + return MY_ERRPTR; + found= lsearch(el, hash->charset, my_reverse_bits(hashnr) | 1, + (uchar *)key, keylen, pins); + lf_rwunlock_by_pins(pins); + return found ? 
found+1 : 0; +} + +static const uchar *dummy_key= (uchar*)""; + +/* + RETURN + 0 - ok + -1 - out of memory +*/ +static int initialize_bucket(LF_HASH *hash, LF_SLIST * volatile *node, + uint bucket, LF_PINS *pins) +{ + uint parent= my_clear_highest_bit(bucket); + LF_SLIST *dummy= (LF_SLIST *)my_malloc(sizeof(LF_SLIST), MYF(MY_WME)); + LF_SLIST **tmp= 0, *cur; + LF_SLIST * volatile *el= _lf_dynarray_lvalue(&hash->array, parent); + if (unlikely(!el || !dummy)) + return -1; + if (*el == NULL && bucket && + unlikely(initialize_bucket(hash, el, parent, pins))) + return -1; + dummy->hashnr= my_reverse_bits(bucket) | 0; /* dummy node */ + dummy->key= dummy_key; + dummy->keylen= 0; + if ((cur= linsert(el, hash->charset, dummy, pins, LF_HASH_UNIQUE))) + { + my_free((void *)dummy, MYF(0)); + dummy= cur; + } + my_atomic_casptr((void **)node, (void **)&tmp, dummy); + /* + note that if the CAS above failed (after linsert() succeeded), + it would mean that some other thread has executed linsert() for + the same dummy node, its linsert() failed, it picked up our + dummy node (in "dummy= cur") and executed the same CAS as above. + Which means that even if CAS above failed we don't need to retry, + and we should not free(dummy) - there's no memory leak here + */ + return 0; +} diff --git a/mysys/mf_iocache.c b/mysys/mf_iocache.c index 0f49dd22bb9..6d63f8b8bf5 100644 --- a/mysys/mf_iocache.c +++ b/mysys/mf_iocache.c @@ -527,7 +527,7 @@ int _my_b_read(register IO_CACHE *info, uchar *Buffer, size_t Count) { if (Count) { - info->error= left_length; /* We only got this many char */ + info->error= (int) left_length; /* We only got this many char */ DBUG_RETURN(1); } length=0; /* Didn't read any chars */ @@ -1255,7 +1255,7 @@ read_append_buffer: info->append_read_pos += copy_len; Count -= copy_len; if (Count) - info->error = save_count - Count; + info->error= (int) (save_count - Count); /* Fill read buffer with data from write buffer */ memcpy(info->buffer, info->append_read_pos, @@ -1644,8 +1644,8 @@ int my_block_write(register IO_CACHE *info, const uchar *Buffer, size_t Count, { /* Of no overlap, write everything without buffering */ if (pos + Count <= info->pos_in_file) - return my_pwrite(info->file, Buffer, Count, pos, - info->myflags | MY_NABP); + return (int) my_pwrite(info->file, Buffer, Count, pos, + info->myflags | MY_NABP); /* Write the part of the block that is before buffer */ length= (uint) (info->pos_in_file - pos); if (my_pwrite(info->file, Buffer, length, pos, info->myflags | MY_NABP)) @@ -1834,6 +1834,9 @@ int end_io_cache(IO_CACHE *info) pthread_mutex_destroy(&info->append_buffer_lock); #endif } +#ifdef THREAD + info->share= 0; +#endif DBUG_RETURN(error); } /* end_io_cache */ diff --git a/mysys/mf_iocache2.c b/mysys/mf_iocache2.c index c54c7d13548..728501e6c50 100644 --- a/mysys/mf_iocache2.c +++ b/mysys/mf_iocache2.c @@ -420,9 +420,9 @@ process_flags: /* minimum width padding */ if (minimum_width > length2) { - char *buffz; + uchar *buffz; - buffz= my_alloca(minimum_width - length2); + buffz= (uchar*) my_alloca(minimum_width - length2); if (is_zero_padded) memset(buffz, '0', minimum_width - length2); else diff --git a/mysys/mf_keycache.c b/mysys/mf_keycache.c index 397a3332740..ea0c97f5913 100644 --- a/mysys/mf_keycache.c +++ b/mysys/mf_keycache.c @@ -2569,7 +2569,7 @@ uchar *key_cache_read(KEY_CACHE *keycache, do { /* Cache could be disabled in a later iteration. */ - + if (!keycache->can_be_used) goto no_key_cache; /* Start reading at the beginning of the cache block. 
*/ @@ -3174,7 +3174,7 @@ int key_cache_write(KEY_CACHE *keycache, if (!dont_write) { - /* Not used in the server. buff has been written to disk at start. */ + /* Not used in the server. buff has been written to disk at start. */ if ((block->status & BLOCK_CHANGED) && (!offset && read_length >= keycache->key_cache_block_size)) link_to_file_list(keycache, block, block->hash_link->file, 1); @@ -3558,10 +3558,11 @@ static int flush_key_blocks_int(KEY_CACHE *keycache, file, keycache->blocks_used, keycache->blocks_changed)); #if !defined(DBUG_OFF) && defined(EXTRA_DEBUG) - DBUG_EXECUTE("check_keycache", - test_key_cache(keycache, "start of flush_key_blocks", 0);); + DBUG_EXECUTE("check_keycache", + test_key_cache(keycache, "start of flush_key_blocks", 0);); #endif + DBUG_ASSERT(type != FLUSH_KEEP_LAZY); cache= cache_buff; if (keycache->disk_blocks > 0 && (!my_disable_flush_key_blocks || type != FLUSH_KEEP)) diff --git a/mysys/mf_keycaches.c b/mysys/mf_keycaches.c index 6227a05ce06..9ea5678da9a 100644 --- a/mysys/mf_keycaches.c +++ b/mysys/mf_keycaches.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003 MySQL AB +/* Copyright (C) 2003-2007 MySQL AB This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -25,269 +25,7 @@ #include <keycache.h> #include <hash.h> #include <m_string.h> - -/***************************************************************************** - General functions to handle SAFE_HASH objects. - - A SAFE_HASH object is used to store the hash, the mutex and default value - needed by the rest of the key cache code. - This is a separate struct to make it easy to later reuse the code for other - purposes - - All entries are linked in a list to allow us to traverse all elements - and delete selected ones. (HASH doesn't allow any easy ways to do this). -*****************************************************************************/ - -/* - Struct to store a key and pointer to object -*/ - -typedef struct st_safe_hash_entry -{ - uchar *key; - uint length; - uchar *data; - struct st_safe_hash_entry *next, **prev; -} SAFE_HASH_ENTRY; - - -typedef struct st_safe_hash_with_default -{ -#ifdef THREAD - rw_lock_t mutex; -#endif - HASH hash; - uchar *default_value; - SAFE_HASH_ENTRY *root; -} SAFE_HASH; - - -/* - Free a SAFE_HASH_ENTRY - - This function is called by the hash object on delete -*/ - -static void safe_hash_entry_free(SAFE_HASH_ENTRY *entry) -{ - DBUG_ENTER("free_assign_entry"); - my_free((uchar*) entry, MYF(0)); - DBUG_VOID_RETURN; -} - - -/* Get key and length for a SAFE_HASH_ENTRY */ - -static uchar *safe_hash_entry_get(SAFE_HASH_ENTRY *entry, size_t *length, - my_bool not_used __attribute__((unused))) -{ - *length=entry->length; - return (uchar*) entry->key; -} - - -/* - Init a SAFE_HASH object - - SYNOPSIS - safe_hash_init() - hash safe_hash handler - elements Expected max number of elements - default_value default value - - NOTES - In case of error we set hash->default_value to 0 to allow one to call - safe_hash_free on an object that couldn't be initialized. 
- - RETURN - 0 ok - 1 error -*/ - -static my_bool safe_hash_init(SAFE_HASH *hash, uint elements, - uchar *default_value) -{ - DBUG_ENTER("safe_hash"); - if (hash_init(&hash->hash, &my_charset_bin, elements, - 0, 0, (hash_get_key) safe_hash_entry_get, - (void (*)(void*)) safe_hash_entry_free, 0)) - { - hash->default_value= 0; - DBUG_RETURN(1); - } - my_rwlock_init(&hash->mutex, 0); - hash->default_value= default_value; - hash->root= 0; - DBUG_RETURN(0); -} - - -/* - Free a SAFE_HASH object - - NOTES - This is safe to call on any object that has been sent to safe_hash_init() -*/ - -static void safe_hash_free(SAFE_HASH *hash) -{ - /* - Test if safe_hash_init succeeded. This will also guard us against multiple - free calls. - */ - if (hash->default_value) - { - hash_free(&hash->hash); - rwlock_destroy(&hash->mutex); - hash->default_value=0; - } -} - -/* - Return the value stored for a key or default value if no key -*/ - -static uchar *safe_hash_search(SAFE_HASH *hash, const uchar *key, uint length) -{ - uchar *result; - DBUG_ENTER("safe_hash_search"); - rw_rdlock(&hash->mutex); - result= hash_search(&hash->hash, key, length); - rw_unlock(&hash->mutex); - if (!result) - result= hash->default_value; - else - result= ((SAFE_HASH_ENTRY*) result)->data; - DBUG_PRINT("exit",("data: 0x%lx", (long) result)); - DBUG_RETURN(result); -} - - -/* - Associate a key with some data - - SYONOPSIS - safe_hash_set() - hash Hash handle - key key (path to table etc..) - length Length of key - data data to to associate with the data - - NOTES - This can be used both to insert a new entry and change an existing - entry. - If one associates a key with the default key cache, the key is deleted - - RETURN - 0 ok - 1 error (Can only be EOM). In this case my_message() is called. -*/ - -static my_bool safe_hash_set(SAFE_HASH *hash, const uchar *key, uint length, - uchar *data) -{ - SAFE_HASH_ENTRY *entry; - my_bool error= 0; - DBUG_ENTER("safe_hash_set"); - DBUG_PRINT("enter",("key: %.*s data: 0x%lx", length, key, (long) data)); - - rw_wrlock(&hash->mutex); - entry= (SAFE_HASH_ENTRY*) hash_search(&hash->hash, key, length); - - if (data == hash->default_value) - { - /* - The key is to be associated with the default entry. 
In this case - we can just delete the entry (if it existed) from the hash as a - search will return the default entry - */ - if (!entry) /* nothing to do */ - goto end; - /* unlink entry from list */ - if ((*entry->prev= entry->next)) - entry->next->prev= entry->prev; - hash_delete(&hash->hash, (uchar*) entry); - goto end; - } - if (entry) - { - /* Entry existed; Just change the pointer to point at the new data */ - entry->data= data; - } - else - { - if (!(entry= (SAFE_HASH_ENTRY *) my_malloc(sizeof(*entry) + length, - MYF(MY_WME)))) - { - error= 1; - goto end; - } - entry->key= (uchar*) (entry +1); - memcpy((char*) entry->key, (char*) key, length); - entry->length= length; - entry->data= data; - /* Link entry to list */ - if ((entry->next= hash->root)) - entry->next->prev= &entry->next; - entry->prev= &hash->root; - hash->root= entry; - if (my_hash_insert(&hash->hash, (uchar*) entry)) - { - /* This can only happen if hash got out of memory */ - my_free((char*) entry, MYF(0)); - error= 1; - goto end; - } - } - -end: - rw_unlock(&hash->mutex); - DBUG_RETURN(error); -} - - -/* - Change all entres with one data value to another data value - - SYONOPSIS - safe_hash_change() - hash Hash handle - old_data Old data - new_data Change all 'old_data' to this - - NOTES - We use the linked list to traverse all elements in the hash as - this allows us to delete elements in the case where 'new_data' is the - default value. -*/ - -static void safe_hash_change(SAFE_HASH *hash, uchar *old_data, uchar *new_data) -{ - SAFE_HASH_ENTRY *entry, *next; - DBUG_ENTER("safe_hash_set"); - - rw_wrlock(&hash->mutex); - - for (entry= hash->root ; entry ; entry= next) - { - next= entry->next; - if (entry->data == old_data) - { - if (new_data == hash->default_value) - { - if ((*entry->prev= entry->next)) - entry->next->prev= entry->prev; - hash_delete(&hash->hash, (uchar*) entry); - } - else - entry->data= new_data; - } - } - - rw_unlock(&hash->mutex); - DBUG_VOID_RETURN; -} - +#include "my_safehash.h" /***************************************************************************** Functions to handle the key cache objects @@ -315,6 +53,7 @@ void multi_keycache_free(void) multi_key_cache_search() key key to find (usually table path) uint length Length of key. 
+ def Default value if no key cache NOTES This function is coded in such a way that we will return the @@ -325,11 +64,13 @@ void multi_keycache_free(void) key cache to use */ -KEY_CACHE *multi_key_cache_search(uchar *key, uint length) +KEY_CACHE *multi_key_cache_search(uchar *key, uint length, + KEY_CACHE *def) { if (!key_cache_hash.hash.records) - return dflt_key_cache; - return (KEY_CACHE*) safe_hash_search(&key_cache_hash, key, length); + return def; + return (KEY_CACHE*) safe_hash_search(&key_cache_hash, key, length, + (void*) def); } @@ -361,3 +102,5 @@ void multi_key_cache_change(KEY_CACHE *old_data, { safe_hash_change(&key_cache_hash, (uchar*) old_data, (uchar*) new_data); } + + diff --git a/mysys/my_bitmap.c b/mysys/my_bitmap.c index e127b2584ae..137127a2fda 100644 --- a/mysys/my_bitmap.c +++ b/mysys/my_bitmap.c @@ -508,10 +508,8 @@ uint bitmap_get_first_set(const MY_BITMAP *map) if (*byte_ptr & (1 << k)) return (i*32) + (j*8) + k; } - DBUG_ASSERT(0); } } - DBUG_ASSERT(0); } } return MY_BIT_NONE; @@ -542,10 +540,8 @@ uint bitmap_get_first(const MY_BITMAP *map) if (!(*byte_ptr & (1 << k))) return (i*32) + (j*8) + k; } - DBUG_ASSERT(0); } } - DBUG_ASSERT(0); } } return MY_BIT_NONE; diff --git a/mysys/my_chmod.c b/mysys/my_chmod.c new file mode 100644 index 00000000000..afdea758833 --- /dev/null +++ b/mysys/my_chmod.c @@ -0,0 +1,48 @@ +/* Copyright (C) 2000 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "mysys_priv.h" +#include "mysys_err.h" + +/** + @brief Change mode of file. 
+
+  @fn my_chmod()
+  @param name      Filename
+  @param mode      Mode
+  @param my_flags  Flags
+
+  @note
+    The mode of the file given by name is changed
+
+  @retval 0  Ok
+  @retval #  Error
+*/
+
+int my_chmod(const char *name, mode_t mode, myf my_flags)
+{
+  DBUG_ENTER("my_chmod");
+  DBUG_PRINT("my",("name: %s  mode: %lu  flags: %d", name, (ulong) mode,
+                   my_flags));
+
+  if (chmod(name, mode))
+  {
+    my_errno= errno;
+    if (my_flags & MY_WME)
+      my_error(EE_CANT_CHMOD, MYF(0), name, (ulong) mode, my_errno);
+    DBUG_RETURN(1);
+  }
+  DBUG_RETURN(0);
+}
diff --git a/mysys/my_compress.c b/mysys/my_compress.c
index a3d9d56915e..45c4ab983cc 100644
--- a/mysys/my_compress.c
+++ b/mysys/my_compress.c
@@ -67,7 +67,7 @@ uchar *my_compress_alloc(const uchar *packet, size_t *len, size_t *complen)
   if (!(compbuf= (uchar *) my_malloc(*complen, MYF(MY_WME))))
     return 0;					/* Not enough memory */
-  tmp_complen= (uint) *complen;
+  tmp_complen= (uLongf) *complen;
   res= compress((Bytef*) compbuf, &tmp_complen, (Bytef*) packet, (uLong) *len);
   *complen= tmp_complen;
@@ -118,7 +118,7 @@ my_bool my_uncompress(uchar *packet, size_t len, size_t *complen)
     if (!compbuf)
       DBUG_RETURN(1);				/* Not enough memory */
-    tmp_complen= (uint) *complen;
+    tmp_complen= (uLongf) *complen;
     error= uncompress((Bytef*) compbuf, &tmp_complen, (Bytef*) packet,
                       (uLong) len);
     *complen= tmp_complen;
diff --git a/mysys/my_error.c b/mysys/my_error.c
index 2cf704d0089..4e04d4fadc2 100644
--- a/mysys/my_error.c
+++ b/mysys/my_error.c
@@ -89,11 +89,11 @@ int my_error(int nr, myf MyFlags, ...)
   /* get the error message string. Default, if NULL or empty string (""). */
   if (! (format= (meh_p && (nr >= meh_p->meh_first)) ?
                  meh_p->meh_errmsgs[nr - meh_p->meh_first] : NULL) || ! *format)
-    (void) my_snprintf (ebuff, sizeof(ebuff), "Unknown error %d", nr);
+    (void) my_snprintf(ebuff, sizeof(ebuff), "Unknown error %d", nr);
   else
   {
     va_start(args,MyFlags);
-    (void) my_vsnprintf (ebuff, sizeof(ebuff), format, args);
+    (void) my_vsnprintf(ebuff, sizeof(ebuff), format, args);
     va_end(args);
   }
   DBUG_RETURN((*error_handler_hook)(nr, ebuff, MyFlags));
@@ -116,15 +116,39 @@ int my_printf_error(uint error, const char *format, myf MyFlags, ...)
   va_list args;
   char ebuff[ERRMSGSIZE];
   DBUG_ENTER("my_printf_error");
-  DBUG_PRINT("my", ("nr: %d  MyFlags: %d  errno: %d  Format: %s",
+  DBUG_PRINT("my", ("nr: %d  MyFlags: %d  errno: %d  format: %s",
                     error, MyFlags, errno, format));
   va_start(args,MyFlags);
-  (void) my_vsnprintf (ebuff, sizeof(ebuff), format, args);
+  (void) my_vsnprintf(ebuff, sizeof(ebuff), format, args);
   va_end(args);
   DBUG_RETURN((*error_handler_hook)(error, ebuff, MyFlags));
 }
+
+/*
+  Error with va_list
+
+  SYNOPSIS
+    my_printv_error()
+      error    Errno
+      format   Format string
+      MyFlags  Flags
+      ...
variable list +*/ + +int my_printv_error(uint error, const char *format, myf MyFlags, va_list ap) +{ + char ebuff[ERRMSGSIZE+20]; + DBUG_ENTER("my_printv_error"); + DBUG_PRINT("my", ("nr: %d MyFlags: %d errno: %d format: %s", + error, MyFlags, errno, format)); + + (void) my_vsnprintf(ebuff, sizeof(ebuff), format, ap); + DBUG_RETURN((*error_handler_hook)(error, ebuff, MyFlags)); +} + + /* Give message using error_handler_hook diff --git a/mysys/my_fopen.c b/mysys/my_fopen.c index 44156da6ae3..351851cca76 100644 --- a/mysys/my_fopen.c +++ b/mysys/my_fopen.c @@ -134,7 +134,7 @@ FILE *my_fdopen(File Filedes, const char *name, int Flags, myf MyFlags) FILE *fd; char type[5]; DBUG_ENTER("my_fdopen"); - DBUG_PRINT("my",("Fd: %d Flags: %d MyFlags: %d", + DBUG_PRINT("my",("fd: %d Flags: %d MyFlags: %d", Filedes, Flags, MyFlags)); make_ftype(type,Flags); diff --git a/mysys/my_getncpus.c b/mysys/my_getncpus.c index 82e87dee2e4..4cb96ac0bca 100644 --- a/mysys/my_getncpus.c +++ b/mysys/my_getncpus.c @@ -16,24 +16,26 @@ /* get the number of (online) CPUs */ #include "mysys_priv.h" +#ifdef HAVE_UNISTD_H #include <unistd.h> +#endif static int ncpus=0; -#ifdef _SC_NPROCESSORS_ONLN int my_getncpus() { if (!ncpus) + { +#ifdef _SC_NPROCESSORS_ONLN ncpus= sysconf(_SC_NPROCESSORS_ONLN); - return ncpus; -} - +#elif defined(__WIN__) + SYSTEM_INFO sysinfo; + GetSystemInfo(&sysinfo); + ncpus= sysinfo.dwNumberOfProcessors; #else -/* unknown */ -int my_getncpus() -{ - return 2; -} - +/* unknown so play safe: assume SMP and forbid uniprocessor build */ + ncpus= 2; #endif - + } + return ncpus; +} diff --git a/mysys/my_getopt.c b/mysys/my_getopt.c index 4b74cdbf266..da7e997d629 100644 --- a/mysys/my_getopt.c +++ b/mysys/my_getopt.c @@ -28,23 +28,22 @@ static void default_reporter(enum loglevel level, const char *format, ...); my_error_reporter my_getopt_error_reporter= &default_reporter; static int findopt(char *optpat, uint length, - const struct my_option **opt_res, - char **ffname); + const struct my_option **opt_res, + char **ffname); my_bool getopt_compare_strings(const char *s, - const char *t, - uint length); + const char *t, + uint length); static longlong getopt_ll(char *arg, const struct my_option *optp, int *err); static ulonglong getopt_ull(char *arg, const struct my_option *optp, - int *err); + int *err); static double getopt_double(char *arg, const struct my_option *optp, int *err); static void init_variables(const struct my_option *options, init_func_p init_one_value); -static void init_one_value(const struct my_option *option, uchar* *variable, - longlong value); +static void init_one_value(const struct my_option *opt, uchar* *, longlong); static void fini_one_value(const struct my_option *option, uchar* *variable, longlong value); -static int setval(const struct my_option *opts, uchar* *value, char *argument, - my_bool set_maximum_value); +static int setval(const struct my_option *opts, uchar **value, char *argument, + my_bool set_maximum_value); static char *check_struct_option(char *cur_arg, char *key_name); /* @@ -779,7 +778,7 @@ static longlong eval_num_suffix(char *argument, int *error, char *option_name) return num; } -/* +/* function: getopt_ll Evaluates and returns the value that user gave as an argument @@ -928,7 +927,6 @@ ulonglong getopt_ull_limit_value(ulonglong num, const struct my_option *optp, my_getopt_error_reporter(WARNING_LEVEL, "option '%s': unsigned value %s adjusted to %s", optp->name, ullstr(old, buf1), ullstr(num, buf2)); - return num; } @@ -969,8 +967,8 @@ static double 
getopt_double(char *arg, const struct my_option *optp, int *err) SYNOPSIS init_one_value() - option Option to initialize - value Pointer to variable + option Option to initialize + value Pointer to variable */ static void init_one_value(const struct my_option *option, uchar* *variable, @@ -999,7 +997,7 @@ static void init_one_value(const struct my_option *option, uchar* *variable, case GET_LL: *((longlong*) variable)= (longlong) getopt_ll_limit_value((longlong) value, option, NULL); break; - case GET_ULL: + case GET_ULL: /* Fall through */ case GET_SET: *((ulonglong*) variable)= (ulonglong) getopt_ull_limit_value((ulonglong) value, option, NULL); break; @@ -1067,7 +1065,7 @@ void my_cleanup_options(const struct my_option *options) } -/* +/* initialize all variables to their default values SYNOPSIS @@ -1237,7 +1235,7 @@ void my_print_variables(const struct my_option *options) printf("%d\n", *((int*) value)); break; case GET_UINT: - printf("%d\n", *((uint*) value)); + printf("%u\n", *((uint*) value)); break; case GET_LONG: printf("%ld\n", *((long*) value)); diff --git a/mysys/my_getsystime.c b/mysys/my_getsystime.c index b692b18bfc7..64480c4aa7a 100644 --- a/mysys/my_getsystime.c +++ b/mysys/my_getsystime.c @@ -222,4 +222,3 @@ time_t my_time_possible_from_micro(ulonglong microtime __attribute__((unused))) return (time_t) (microtime / 1000000); #endif /* defined(__WIN__) */ } - diff --git a/mysys/my_handler.c b/mysys/my_handler.c index 3bc27b622cb..7c13149cb27 100644 --- a/mysys/my_handler.c +++ b/mysys/my_handler.c @@ -20,26 +20,27 @@ #include <my_base.h> #include <my_handler.h> #include <my_sys.h> - #include "my_handler_errors.h" -int ha_compare_text(CHARSET_INFO *charset_info, uchar *a, uint a_length, - uchar *b, uint b_length, my_bool part_key, +int ha_compare_text(CHARSET_INFO *charset_info, const uchar *a, uint a_length, + const uchar *b, uint b_length, my_bool part_key, my_bool skip_end_space) { if (!part_key) return charset_info->coll->strnncollsp(charset_info, a, a_length, - b, b_length, (my_bool)!skip_end_space); + b, b_length, + (my_bool)!skip_end_space); return charset_info->coll->strnncoll(charset_info, a, a_length, b, b_length, part_key); } -static int compare_bin(uchar *a, uint a_length, uchar *b, uint b_length, +static int compare_bin(const uchar *a, uint a_length, + const uchar *b, uint b_length, my_bool part_key, my_bool skip_end_space) { uint length= min(a_length,b_length); - uchar *end= a+ length; + const uchar *end= a+ length; int flag; while (a < end) @@ -83,13 +84,15 @@ static int compare_bin(uchar *a, uint a_length, uchar *b, uint b_length, ha_key_cmp() keyseg Array of key segments of key to compare a First key to compare, in format from _mi_pack_key() - This is normally key specified by user - b Second key to compare. This is always from a row - key_length Length of key to compare. This can be shorter than - a to just compare sub keys + This is always from the row + b Second key to compare. This is from the row or the user + key_length Length of key to compare, based on key b. This can be shorter + than b to just compare sub keys next_flag How keys should be compared If bit SEARCH_FIND is not set the keys includes the row position and this should also be compared + If SEARCH_PAGE_KEY_HAS_TRANSID is set then 'a' has transid + If SEARCH_USER_KEY_HAS_TRANSID is set then 'b' has transid diff_pos OUT Number of first keypart where values differ, counting from one. 
diff_pos[1] OUT  (b + diff_pos[1]) points to first value in tuple b
@@ -118,8 +121,8 @@ static int compare_bin(uchar *a, uint a_length, uchar *b, uint b_length,
 #define FCMP(A,B) ((int) (A) - (int) (B))
-int ha_key_cmp(register HA_KEYSEG *keyseg, register uchar *a,
-               register uchar *b, uint key_length, uint nextflag,
+int ha_key_cmp(register HA_KEYSEG *keyseg, register const uchar *a,
+               register const uchar *b, uint key_length, uint32 nextflag,
                uint *diff_pos)
 {
   int flag;
@@ -129,12 +132,12 @@ int ha_key_cmp(register HA_KEYSEG *keyseg, register uchar *a,
   float f_1,f_2;
   double d_1,d_2;
   uint next_key_length;
-  uchar *orig_b= b;
+  const uchar *orig_b= b;
   *diff_pos=0;
   for ( ; (int) key_length >0 ; key_length=next_key_length, keyseg++)
   {
-    uchar *end;
+    const uchar *end;
     uint piks=! (keyseg->flag & HA_NO_SORT);
     (*diff_pos)++;
     diff_pos[1]= (uint)(b - orig_b);
@@ -151,8 +154,13 @@ int ha_key_cmp(register HA_KEYSEG *keyseg, register uchar *a,
       b++;
       if (!*a++)                                /* If key was NULL */
       {
-        if (nextflag == (SEARCH_FIND | SEARCH_UPDATE))
-          nextflag=SEARCH_SAME;                 /* Allow duplicate keys */
+        if ((nextflag & (SEARCH_FIND | SEARCH_UPDATE | SEARCH_INSERT |
+                         SEARCH_NULL_ARE_EQUAL)) ==
+            (SEARCH_FIND | SEARCH_UPDATE | SEARCH_INSERT))
+        {
+          /* Allow duplicate keys */
+          nextflag= (nextflag & ~(SEARCH_FIND | SEARCH_UPDATE)) | SEARCH_SAME;
+        }
        else if (nextflag & SEARCH_NULL_ARE_NOT_EQUAL)
        {
          /*
@@ -366,7 +374,7 @@ int ha_key_cmp(register HA_KEYSEG *keyseg, register uchar *a,
       if (keyseg->flag & HA_REVERSE_SORT)
       {
-        swap_variables(uchar*, a, b);
+        swap_variables(const uchar*, a, b);
         swap_flag=1;                            /* Remember swap of a & b */
         end= a+ (int) (end-b);
       }
@@ -391,7 +399,7 @@ int ha_key_cmp(register HA_KEYSEG *keyseg, register uchar *a,
         if (*b != '-')
           return -1;
         a++; b++;
-        swap_variables(uchar*, a, b);
+        swap_variables(const uchar*, a, b);
         swap_variables(int, alength, blength);
         swap_flag=1-swap_flag;
         alength--; blength--;
@@ -420,7 +428,7 @@ int ha_key_cmp(register HA_KEYSEG *keyseg, register uchar *a,
       }
       if (swap_flag)                            /* Restore pointers */
-        swap_variables(uchar*, a, b);
+        swap_variables(const uchar*, a, b);
       break;
     }
 #ifdef HAVE_LONG_LONG
@@ -455,18 +463,90 @@ int ha_key_cmp(register HA_KEYSEG *keyseg, register uchar *a,
 end:
   if (!(nextflag & SEARCH_FIND))
   {
+    /*
+      Compare rowid and possible transid.
+      This happens in the following cases:
+      - INSERT, UPDATE and DELETE when we have non-unique keys or
+        are using versioning
+      - SEARCH_NEXT, SEARCH_PREVIOUS when we need to restart a search
+
+      The logic for comparing transids is as follows:
+      Keys that have a transid have the lowest bit set in the rowid.
+      This means that if we are comparing a key with a transid with
+      another key that doesn't have a transid, we must reset the lowest
+      bit for both keys.
+
+      When both keys have a transid, the keys are compared in transid
+      order. A key without a transid is regarded as smaller than a key
+      with a transid.
+    */
+    uint i;
+    uchar key_mask, tmp_a, tmp_b;
+
    if (nextflag & (SEARCH_NO_FIND | SEARCH_LAST)) /* Find record after key */
      return (nextflag & (SEARCH_BIGGER | SEARCH_LAST)) ? -1 : 1;
-    flag=0;
-    for (i=keyseg->length ; i-- > 0 ; )
+    key_mask= (uchar) 255;
+
+    if (!(nextflag & (SEARCH_USER_KEY_HAS_TRANSID |
+                      SEARCH_PAGE_KEY_HAS_TRANSID)))
+    {
+      /*
+        Neither key has a trid. Only compare row ids and don't
+        try to store rows in trid order
+      */
+      key_length= keyseg->length;
+      nextflag&= ~SEARCH_INSERT;
+    }
+    else
+    {
+      /*
+        Set key_mask so that we reset the last bit in the rowid before
+        we compare it.
This is needed as the lowest bit in the rowid is
+        used to mark if the key has a transid or not.
+      */
+      key_mask= (uchar) 254;
+      if (!test_all_bits(nextflag, (SEARCH_USER_KEY_HAS_TRANSID |
+                                    SEARCH_PAGE_KEY_HAS_TRANSID)))
+      {
+        /*
+          No transaction id for the user key or for the key on the page.
+          Ignore the transid, as at least one of the keys is visible to all.
+        */
+        key_length= keyseg->length;
+      }
+      else
+      {
+        /*
+          Both keys have trids. No need for special handling of incomplete
+          trids below.
+        */
+        nextflag&= ~SEARCH_INSERT;
+      }
+    }
+    DBUG_ASSERT(key_length > 0);
+
+    for (i= key_length-1 ; (int) i-- > 0 ; )
    {
      if (*a++ != *b++)
      {
        flag= FCMP(a[-1],b[-1]);
-        break;
+        goto found;
      }
    }
+    tmp_a= *a & key_mask;
+    tmp_b= *b & key_mask;
+    flag= FCMP(tmp_a, tmp_b);
+
+    if (flag == 0 && (nextflag & SEARCH_INSERT))
+    {
+      /*
+        Ensure that on insert we get rows stored in trid order.
+        If one of the parts doesn't have a trid, it should be regarded
+        as smaller than the other.
+      */
+      return (nextflag & SEARCH_USER_KEY_HAS_TRANSID) ? -1 : 1;
+    }
+found:
    if (nextflag & SEARCH_SAME)
      return (flag);                             /* read same */
    if (nextflag & SEARCH_BIGGER)
@@ -498,11 +578,11 @@ end:
   NULLs.
 */
-HA_KEYSEG *ha_find_null(HA_KEYSEG *keyseg, uchar *a)
+HA_KEYSEG *ha_find_null(HA_KEYSEG *keyseg, const uchar *a)
 {
   for (; (enum ha_base_keytype) keyseg->type != HA_KEYTYPE_END; keyseg++)
   {
-    uchar *end;
+    const uchar *end;
     if (keyseg->null_bit)
     {
       if (!*a++)
@@ -567,7 +647,6 @@ HA_KEYSEG *ha_find_null(HA_KEYSEG *keyseg, uchar *a)
 }
-
 /*
   Register handler error messages for usage with my_error()
@@ -576,7 +655,6 @@ HA_KEYSEG *ha_find_null(HA_KEYSEG *keyseg, uchar *a)
   will ignore calls to register already registered error numbers.
 */
-
 void my_handler_error_register(void)
 {
   /*
diff --git a/mysys/my_handler_errors.h b/mysys/my_handler_errors.h
index e360af8c57e..4c952466545 100644
--- a/mysys/my_handler_errors.h
+++ b/mysys/my_handler_errors.h
@@ -32,7 +32,7 @@ static const char *handler_error_messages[]=
   "Table is crashed and last repair failed",
   "Table was marked as crashed and should be repaired",
   "Lock timed out; Retry transaction",
-  "Lock table is full; Restart program with a larger locktable",
+  "Lock table is full; Restart program with a larger lock table",
   "Updates are not allowed under a read only transactions",
   "Lock deadlock; Retry transaction",
   "Foreign key constraint is incorrectly formed",
@@ -46,7 +46,7 @@ static const char *handler_error_messages[]=
   "Unexpected null pointer found when using spatial index",
   "The table changed in storage engine",
   "There's no partition in table for the given value",
-  "Row-based binlogging of row failed",
+  "Row-based binary logging of row failed",
   "Index needed in foreign key constraint",
   "Upholding foreign key constraints would lead to a duplicate key error in "
   "some other table",
@@ -55,13 +55,14 @@ static const char *handler_error_messages[]=
   "Failed to get next auto increment value",
   "Failed to set row auto increment value",
   "Unknown (generic) error from engine",
-  "Record is the same",
+  "Record was not updated. Original values were the same as new values",
   "It is not possible to log this statement",
   "The event was corrupt, leading to illegal data being read",
   "The table is of a new format not supported by this version",
-  "The event could not be processed no other hanlder error happened",
-  "Got a fatal error during initialzaction of handler",
-  "File to short; Expected more data in file",
-  "Read page with wrong checksum"
+  "The event could not be processed.
No other handler error happened",
+  "Got a fatal error during initialization of handler",
+  "File too short; Expected more data in file",
+  "Read page with wrong checksum",
+  "Row is not visible to the current transaction"
 };
diff --git a/mysys/my_init.c b/mysys/my_init.c
index b330ffac65a..0cf5c1b7e36 100644
--- a/mysys/my_init.c
+++ b/mysys/my_init.c
@@ -77,21 +77,21 @@ my_bool my_init(void)
   my_umask= 0660;                       /* Default umask for new files */
   my_umask_dir= 0700;                   /* Default umask for new directories */
   init_glob_errs();
-#if defined(THREAD)
-  if (my_thread_global_init())
-    return 1;
-# if defined(SAFE_MUTEX)
-  safe_mutex_global_init();		/* Must be called early */
-# endif
-#endif
-#if defined(THREAD) && defined(MY_PTHREAD_FASTMUTEX) && !defined(SAFE_MUTEX)
-  fastmutex_global_init();              /* Must be called early */
-#endif
+  my_progname_short= "unknown";
+  if (my_progname)
+    my_progname_short= my_progname + dirname_length(my_progname);
+
+  /* First initialize systems and global libraries */
   netware_init();
 #ifdef THREAD
 #if defined(HAVE_PTHREAD_INIT)
   pthread_init();			/* Must be called before DBUG_ENTER */
 #endif
+  /* Initialize our mutex handling */
+  my_mutex_init();
+  /* Initialize mysys global variables and global mutex */
+  if (my_thread_global_init())
+    return 1;
 #if !defined( __WIN__) && !defined(__NETWARE__)
   sigfillset(&my_signals);		/* signals blocked by mf_brkhant */
 #endif
@@ -164,6 +164,9 @@ void my_end(int infoflag)
   free_charsets();
   my_error_unregister_all();
   my_once_free();
+#ifdef THREAD
+  my_thread_destroy_mutex();
+#endif
   if ((infoflag & MY_GIVE_INFO) || print_info)
   {
@@ -194,6 +197,10 @@ Voluntary context switches %ld, Involuntary context switches %ld\n",
   fprintf(info_file,"\nRun time: %.1f\n",(double) clock()/CLOCKS_PER_SEC);
 #endif
 #if defined(SAFEMALLOC)
+  /* Wait for other threads to free mysys_var */
+#ifdef THREAD
+  (void) my_wait_for_other_threads_to_die(1);
+#endif
   TERMINATE(stderr, (infoflag & MY_GIVE_INFO) != 0);
 #elif defined(__WIN__) && defined(_MSC_VER)
   _CrtSetReportMode( _CRT_WARN, _CRTDBG_MODE_FILE );
@@ -218,6 +225,7 @@ Voluntary context switches %ld, Involuntary context switches %ld\n",
 #ifdef THREAD
   my_thread_end();
   my_thread_global_end();
+  my_mutex_end();
 #if defined(SAFE_MUTEX)
   /*
     Check on destroying of mutexes. A few may be left that will get cleaned
@@ -235,6 +243,13 @@ Voluntary context switches %ld, Involuntary context switches %ld\n",
   my_init_done=0;
 } /* my_end */
+#ifndef DBUG_OFF
+/* Dummy tag function for debugging */
+
+void my_debug_put_break_here(void)
+{
+}
+#endif
 #ifdef __WIN__
diff --git a/mysys/my_lock.c b/mysys/my_lock.c
index c0522ee849d..8450fcfc30a 100644
--- a/mysys/my_lock.c
+++ b/mysys/my_lock.c
@@ -49,12 +49,12 @@ int my_lock(File fd, int locktype, my_off_t start, my_off_t length,
   int nxErrno;
 #endif
   DBUG_ENTER("my_lock");
-  DBUG_PRINT("my",("Fd: %d  Op: %d  start: %ld  Length: %ld  MyFlags: %d",
+  DBUG_PRINT("my",("fd: %d  Op: %d  start: %ld  Length: %ld  MyFlags: %d",
		   fd,locktype,(long) start,(long) length,MyFlags));
 #ifdef VMS
   DBUG_RETURN(0);
 #else
-  if (my_disable_locking)
+  if (my_disable_locking && ! (MyFlags & MY_FORCE_LOCK))
     DBUG_RETURN(0);
 #if defined(__NETWARE__)
@@ -87,7 +87,7 @@ int my_lock(File fd, int locktype, my_off_t start, my_off_t length,
       nxLockFlags = NX_RANGE_LOCK_EXCL;
     }
-    if (MyFlags & MY_DONT_WAIT)
+    if (MyFlags & MY_NO_WAIT)
     {
       /* Don't block on the lock.
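A hedged aside, not part of the patch: with MY_DONT_WAIT split into MY_NO_WAIT and MY_SHORT_WAIT (see the fcntl() branch just below), a caller that must never block can now detect a busy region explicitly. F_WRLCK, F_TO_EOF and the two return policies are illustrative assumptions:

/* Sketch only: try a non-blocking write lock on the whole file */
if (my_lock(fd, F_WRLCK, 0L, F_TO_EOF, MYF(MY_NO_WAIT)) == -1)
{
  if (my_errno == EAGAIN)
    return RETRY_LATER;                 /* hypothetical caller policy */
  return LOCK_ERROR;                    /* hypothetical: real locking error */
}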
*/ nxLockFlags |= NX_RANGE_LOCK_TRYLOCK; @@ -131,10 +131,16 @@ int my_lock(File fd, int locktype, my_off_t start, my_off_t length, lock.l_start= (off_t) start; lock.l_len= (off_t) length; - if (MyFlags & MY_DONT_WAIT) + if (MyFlags & (MY_NO_WAIT | MY_SHORT_WAIT)) { if (fcntl(fd,F_SETLK,&lock) != -1) /* Check if we can lock */ - DBUG_RETURN(0); /* Ok, file locked */ + DBUG_RETURN(0); /* Ok, file locked */ + if (MyFlags & MY_NO_WAIT) + { + my_errno= (errno == EACCES) ? EAGAIN : errno ? errno : -1; + DBUG_RETURN(-1); + } + DBUG_PRINT("info",("Was locked, trying with alarm")); ALARM_INIT; while ((value=fcntl(fd,F_SETLKW,&lock)) && ! ALARM_TEST && diff --git a/mysys/my_malloc.c b/mysys/my_malloc.c index 12793ad451b..12af5603a93 100644 --- a/mysys/my_malloc.c +++ b/mysys/my_malloc.c @@ -13,6 +13,9 @@ along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +/* my_global.h may define SAFEMALLOC (through my_config.h). */ +#include <my_global.h> + #ifdef SAFEMALLOC /* We don't need SAFEMALLOC here */ #undef SAFEMALLOC #endif diff --git a/mysys/my_once.c b/mysys/my_once.c index b6f6656fce2..73bdd0166e6 100644 --- a/mysys/my_once.c +++ b/mysys/my_once.c @@ -15,6 +15,9 @@ /* Not MT-SAFE */ +/* my_global.h may define SAFEMALLOC (through my_config.h). */ +#include <my_global.h> + #ifdef SAFEMALLOC /* We don't need SAFEMALLOC here */ #undef SAFEMALLOC #endif diff --git a/mysys/my_pread.c b/mysys/my_pread.c index 3f62f150c91..836f5a92963 100644 --- a/mysys/my_pread.c +++ b/mysys/my_pread.c @@ -15,6 +15,8 @@ #include "mysys_priv.h" #include "mysys_err.h" +#include "my_base.h" +#include <m_string.h> #include <errno.h> #ifdef HAVE_PREAD #include <unistd.h> @@ -46,27 +48,37 @@ size_t my_pread(File Filedes, uchar *Buffer, size_t Count, my_off_t offset, { size_t readbytes; int error= 0; +#ifndef HAVE_PREAD + int save_errno; +#endif +#ifndef DBUG_OFF + char llbuf[22]; DBUG_ENTER("my_pread"); - DBUG_PRINT("my",("Fd: %d Seek: %lu Buffer: 0x%lx Count: %u MyFlags: %d", - Filedes, (ulong) offset, (long) Buffer, (uint) Count, - MyFlags)); + DBUG_PRINT("my",("fd: %d Seek: %s Buffer: 0x%lx Count: %lu MyFlags: %d", + Filedes, ullstr(offset, llbuf), (long) Buffer, + (ulong)Count, MyFlags)); +#endif for (;;) { -#ifndef __WIN__ - errno=0; /* Linux doesn't reset this */ -#endif + errno= 0; /* Linux, Windows don't reset this on EOF/success */ #ifndef HAVE_PREAD pthread_mutex_lock(&my_file_info[Filedes].mutex); readbytes= (uint) -1; error= (lseek(Filedes, offset, MY_SEEK_SET) == (my_off_t) -1 || (readbytes= read(Filedes, Buffer, (uint) Count)) != Count); + save_errno= errno; pthread_mutex_unlock(&my_file_info[Filedes].mutex); + if (error) + { + errno= save_errno; #else if ((error= ((readbytes= pread(Filedes, Buffer, Count, offset)) != Count))) - my_errno= errno ? errno : -1; -#endif - if (error || readbytes != Count) { +#endif + my_errno= errno ? 
errno : -1; + if (errno == 0 || (readbytes != (size_t) -1 && + (MyFlags & (MY_NABP | MY_FNABP)))) + my_errno= HA_ERR_FILE_TOO_SHORT; DBUG_PRINT("warning",("Read only %d bytes off %u from %d, errno: %d", (int) readbytes, (uint) Count,Filedes,my_errno)); #ifdef THREAD @@ -122,10 +134,13 @@ size_t my_pwrite(int Filedes, const uchar *Buffer, size_t Count, { size_t writenbytes, written; uint errors; +#ifndef DBUG_OFF + char llbuf[22]; DBUG_ENTER("my_pwrite"); - DBUG_PRINT("my",("Fd: %d Seek: %lu Buffer: 0x%lx Count: %u MyFlags: %d", - Filedes, (ulong) offset, (long) Buffer, (uint) Count, - MyFlags)); + DBUG_PRINT("my",("fd: %d Seek: %s Buffer: 0x%lx Count: %lu MyFlags: %d", + Filedes, ullstr(offset, llbuf), (long) Buffer, + (ulong)Count, MyFlags)); +#endif errors= 0; written= 0; diff --git a/mysys/my_pthread.c b/mysys/my_pthread.c index aba3e47d754..e97bbe89be0 100644 --- a/mysys/my_pthread.c +++ b/mysys/my_pthread.c @@ -429,7 +429,8 @@ int sigwait(sigset_t *setp, int *sigp) #include <netdb.h> -int my_pthread_mutex_init(pthread_mutex_t *mp, const pthread_mutexattr_t *attr) +int my_pthread_mutex_noposix_init(pthread_mutex_t *mp, + const pthread_mutexattr_t *attr) { int error; if (!attr) @@ -439,7 +440,8 @@ int my_pthread_mutex_init(pthread_mutex_t *mp, const pthread_mutexattr_t *attr) return error; } -int my_pthread_cond_init(pthread_cond_t *mp, const pthread_condattr_t *attr) +int my_pthread_cond_noposix_init(pthread_cond_t *mp, + const pthread_condattr_t *attr) { int error; if (!attr) diff --git a/mysys/my_read.c b/mysys/my_read.c index 0c302d5b227..25ffe73d813 100644 --- a/mysys/my_read.c +++ b/mysys/my_read.c @@ -15,9 +15,9 @@ #include "mysys_priv.h" #include "mysys_err.h" +#include <my_base.h> #include <errno.h> - /* Read a chunk of bytes from a file with retry's if needed @@ -37,16 +37,19 @@ size_t my_read(File Filedes, uchar *Buffer, size_t Count, myf MyFlags) { size_t readbytes, save_count; DBUG_ENTER("my_read"); - DBUG_PRINT("my",("Fd: %d Buffer: 0x%lx Count: %lu MyFlags: %d", + DBUG_PRINT("my",("fd: %d Buffer: 0x%lx Count: %lu MyFlags: %d", Filedes, (long) Buffer, (ulong) Count, MyFlags)); save_count= Count; for (;;) { - errno= 0; /* Linux doesn't reset this */ + errno= 0; /* Linux, Windows don't reset this on EOF/success */ if ((readbytes= read(Filedes, Buffer, (uint) Count)) != Count) { - my_errno= errno ? errno : -1; + my_errno= errno; + if (errno == 0 || (readbytes != (size_t) -1 && + (MyFlags & (MY_NABP | MY_FNABP)))) + my_errno= HA_ERR_FILE_TOO_SHORT; DBUG_PRINT("warning",("Read only %d bytes off %lu from %d, errno: %d", (int) readbytes, (ulong) Count, Filedes, my_errno)); diff --git a/mysys/my_realloc.c b/mysys/my_realloc.c index a55282e03a0..7e49a482884 100644 --- a/mysys/my_realloc.c +++ b/mysys/my_realloc.c @@ -13,6 +13,9 @@ along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +/* my_global.h may define SAFEMALLOC (through my_config.h). 
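Stepping back from the my_read()/my_pread() hunks above: a short read under MY_NABP or MY_FNABP now reports HA_ERR_FILE_TOO_SHORT in my_errno instead of an ambiguous 0 or -1. A hedged caller-side sketch; handle_eof() and handle_io_error() are hypothetical helpers:

uchar buf[IO_SIZE];

/* With MY_NABP, my_pread() returns 0 only if all requested bytes arrived */
if (my_pread(fd, buf, sizeof(buf), offset, MYF(MY_NABP)))
{
  if (my_errno == HA_ERR_FILE_TOO_SHORT)
    handle_eof();                       /* end of file, not an I/O failure */
  else
    handle_io_error(my_errno);          /* genuine read error */
}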
*/
+#include <my_global.h>
+
 #ifdef SAFEMALLOC				/* We don't need SAFEMALLOC here */
 #undef SAFEMALLOC
 #endif
@@ -32,6 +35,7 @@
   @note if size==0 realloc() may return NULL; my_realloc() treats this as an
   error which is not the intention of realloc()
 */
+
 void* my_realloc(void* oldpoint, size_t size, myf my_flags)
 {
   void *point;
diff --git a/mysys/my_rnd.c b/mysys/my_rnd.c
new file mode 100644
index 00000000000..b7dca0f2afd
--- /dev/null
+++ b/mysys/my_rnd.c
@@ -0,0 +1,55 @@
+/* Copyright (C) 2007 MySQL AB & Michael Widenius
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#include "mysys_priv.h"
+#include <m_string.h>
+
+/*
+  Initialize random generator
+
+  NOTES
+    MySQL's password checks depend on this, so don't make any changes
+    that alter the random numbers that are generated!
+*/
+
+void my_rnd_init(struct my_rnd_struct *rand_st, ulong seed1, ulong seed2)
+{
+#ifdef HAVE_purify
+  bzero((char*) rand_st,sizeof(*rand_st));      /* Avoid UMC warnings */
+#endif
+  rand_st->max_value= 0x3FFFFFFFL;
+  rand_st->max_value_dbl=(double) rand_st->max_value;
+  rand_st->seed1=seed1%rand_st->max_value ;
+  rand_st->seed2=seed2%rand_st->max_value;
+}
+
+
+/*
+  Generate random number.
+
+  SYNOPSIS
+    my_rnd()
+      rand_st    INOUT  Structure used for number generation
+
+  RETURN VALUE
+    generated pseudo random number
+*/
+
+double my_rnd(struct my_rnd_struct *rand_st)
+{
+  rand_st->seed1=(rand_st->seed1*3+rand_st->seed2) % rand_st->max_value;
+  rand_st->seed2=(rand_st->seed1+rand_st->seed2+33) % rand_st->max_value;
+  return (((double) rand_st->seed1)/rand_st->max_value_dbl);
+}
diff --git a/mysys/my_safehash.c b/mysys/my_safehash.c
new file mode 100644
index 00000000000..b3d6439793c
--- /dev/null
+++ b/mysys/my_safehash.c
@@ -0,0 +1,297 @@
+/* Copyright (C) 2003-2007 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/*
+  Handling of multiple key caches
+
+  The idea is to have a thread safe hash on the table name,
+  with a default key cache value that is returned if the table name is not in
+  the cache.
+*/
+
+#include "mysys_priv.h"
+#include <m_string.h>
+#include "my_safehash.h"
+
+/*****************************************************************************
+  General functions to handle SAFE_HASH objects.
+
+  A SAFE_HASH object is used to store the hash, the mutex and default value
+  needed by the rest of the key cache code.
+  This is a separate struct to make it easy to later reuse the code for other
+  purposes.
+
+  All entries are linked in a list to allow us to traverse all elements
+  and delete selected ones. (HASH doesn't allow any easy ways to do this).
+*****************************************************************************/
+
+
+/*
+  Free a SAFE_HASH_ENTRY
+
+  SYNOPSIS
+    safe_hash_entry_free()
+      entry  The entry which should be freed
+
+  NOTE
+    This function is called by the hash object on delete
+*/
+
+static void safe_hash_entry_free(SAFE_HASH_ENTRY *entry)
+{
+  DBUG_ENTER("safe_hash_entry_free");
+  my_free((uchar*) entry, MYF(0));
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Get key and length for a SAFE_HASH_ENTRY
+
+  SYNOPSIS
+    safe_hash_entry_get()
+      entry   The entry for which the key should be returned
+      length  Length of the key
+
+  RETURN
+    #  reference to the key
+*/
+
+static uchar *safe_hash_entry_get(SAFE_HASH_ENTRY *entry, size_t *length,
+                                  my_bool not_used __attribute__((unused)))
+{
+  *length= entry->length;
+  return (uchar*) entry->key;
+}
+
+
+/*
+  Init a SAFE_HASH object
+
+  SYNOPSIS
+    safe_hash_init()
+      hash           safe_hash handler
+      elements       Expected max number of elements
+      default_value  default value
+
+  NOTES
+    In case of error we set hash->default_value to 0 to allow one to call
+    safe_hash_free on an object that couldn't be initialized.
+
+  RETURN
+    0  OK
+    1  error
+*/
+
+my_bool safe_hash_init(SAFE_HASH *hash, uint elements,
+                       uchar *default_value)
+{
+  DBUG_ENTER("safe_hash_init");
+  if (hash_init(&hash->hash, &my_charset_bin, elements,
+                0, 0, (hash_get_key) safe_hash_entry_get,
+                (void (*)(void*)) safe_hash_entry_free, 0))
+  {
+    hash->default_value= 0;
+    DBUG_RETURN(1);
+  }
+  my_rwlock_init(&hash->mutex, 0);
+  hash->default_value= default_value;
+  hash->root= 0;
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Free a SAFE_HASH object
+
+  SYNOPSIS
+    safe_hash_free()
+      hash  Hash handle
+
+  NOTES
+    This is safe to call on any object that has been sent to safe_hash_init()
+*/
+
+void safe_hash_free(SAFE_HASH *hash)
+{
+  /*
+    Test if safe_hash_init succeeded. This will also guard us against multiple
+    free calls.
+  */
+  if (hash->default_value)
+  {
+    hash_free(&hash->hash);
+    rwlock_destroy(&hash->mutex);
+    hash->default_value=0;
+  }
+}
+
+
+/*
+  Return the value stored for a key or default value if no key
+
+  SYNOPSIS
+    safe_hash_search()
+      hash    Hash handle
+      key     key (path to table etc..)
+      length  Length of key
+      def     Default value of data
+
+  RETURN
+    #  data associated with the key or default value if data was not found
+*/
+
+uchar *safe_hash_search(SAFE_HASH *hash, const uchar *key, uint length,
+                        uchar *def)
+{
+  uchar *result;
+  DBUG_ENTER("safe_hash_search");
+  rw_rdlock(&hash->mutex);
+  result= hash_search(&hash->hash, key, length);
+  rw_unlock(&hash->mutex);
+  if (!result)
+    result= def;
+  else
+    result= ((SAFE_HASH_ENTRY*) result)->data;
+  DBUG_PRINT("exit",("data: 0x%lx", (long) result));
+  DBUG_RETURN(result);
+}
+
+
+/*
+  Associate a key with some data
+
+  SYNOPSIS
+    safe_hash_set()
+      hash    Hash handle
+      key     key (path to table etc..)
+      length  Length of key
+      data    Data to associate with the key
+
+  NOTES
+    This can be used both to insert a new entry and change an existing
+    entry.
+    If one associates a key with the default key cache, the key is deleted
+
+  RETURN
+    0  OK
+    1  error (Can only be EOM). In this case my_message() is called.
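Taken together, the functions in this new file form a small thread-safe map API. A hedged usage sketch (mine, not from the patch), with obj1 and obj_default standing in for caller objects such as KEY_CACHE pointers:

SAFE_HASH h;
uchar *dflt= (uchar*) obj_default;      /* assumed caller-provided default */
uchar *p;

if (safe_hash_init(&h, 16, dflt))
  return 1;                             /* EOM; safe_hash_free(&h) stays safe */
safe_hash_set(&h, (uchar*) "./test/t1", 9, (uchar*) obj1);
p= safe_hash_search(&h, (uchar*) "./test/t2", 9, dflt); /* miss: returns dflt */
safe_hash_set(&h, (uchar*) "./test/t1", 9, dflt);  /* default value: entry deleted */
safe_hash_change(&h, (uchar*) obj1, dflt);  /* remap (here: delete) all obj1 entries */
safe_hash_free(&h);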
+*/ + +my_bool safe_hash_set(SAFE_HASH *hash, const uchar *key, uint length, + uchar *data) +{ + SAFE_HASH_ENTRY *entry; + my_bool error= 0; + DBUG_ENTER("safe_hash_set"); + DBUG_PRINT("enter",("key: %.*s data: 0x%lx", length, key, (long) data)); + + rw_wrlock(&hash->mutex); + entry= (SAFE_HASH_ENTRY*) hash_search(&hash->hash, key, length); + + if (data == hash->default_value) + { + /* + The key is to be associated with the default entry. In this case + we can just delete the entry (if it existed) from the hash as a + search will return the default entry + */ + if (!entry) /* nothing to do */ + goto end; + /* unlink entry from list */ + if ((*entry->prev= entry->next)) + entry->next->prev= entry->prev; + hash_delete(&hash->hash, (uchar*) entry); + goto end; + } + if (entry) + { + /* Entry existed; Just change the pointer to point at the new data */ + entry->data= data; + } + else + { + if (!(entry= (SAFE_HASH_ENTRY *) my_malloc(sizeof(*entry) + length, + MYF(MY_WME)))) + { + error= 1; + goto end; + } + entry->key= (uchar*) (entry +1); + memcpy((char*) entry->key, (char*) key, length); + entry->length= length; + entry->data= data; + /* Link entry to list */ + if ((entry->next= hash->root)) + entry->next->prev= &entry->next; + entry->prev= &hash->root; + hash->root= entry; + if (my_hash_insert(&hash->hash, (uchar*) entry)) + { + /* This can only happen if hash got out of memory */ + my_free((char*) entry, MYF(0)); + error= 1; + goto end; + } + } + +end: + rw_unlock(&hash->mutex); + DBUG_RETURN(error); +} + + +/* + Change all entries with one data value to another data value + + SYNOPSIS + safe_hash_change() + hash Hash handle + old_data Old data + new_data Change all 'old_data' to this + + NOTES + We use the linked list to traverse all elements in the hash as + this allows us to delete elements in the case where 'new_data' is the + default value. +*/ + +void safe_hash_change(SAFE_HASH *hash, uchar *old_data, uchar *new_data) +{ + SAFE_HASH_ENTRY *entry, *next; + DBUG_ENTER("safe_hash_change"); + + rw_wrlock(&hash->mutex); + + for (entry= hash->root ; entry ; entry= next) + { + next= entry->next; + if (entry->data == old_data) + { + if (new_data == hash->default_value) + { + if ((*entry->prev= entry->next)) + entry->next->prev= entry->prev; + hash_delete(&hash->hash, (uchar*) entry); + } + else + entry->data= new_data; + } + } + + rw_unlock(&hash->mutex); + DBUG_VOID_RETURN; +} diff --git a/mysys/my_safehash.h b/mysys/my_safehash.h new file mode 100644 index 00000000000..8a5856b6763 --- /dev/null +++ b/mysys/my_safehash.h @@ -0,0 +1,58 @@ +/* Copyright (C) 2003 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + Handling of multiple key caches + + The idea is to have a thread safe hash on the table name, + with a default key cache value that is returned if the table name is not in + the cache. 
+*/ + +#include <hash.h> + +/* + Struct to store a key and pointer to object +*/ + +typedef struct st_safe_hash_entry +{ + uchar *key; + uint length; + uchar *data; + struct st_safe_hash_entry *next, **prev; +} SAFE_HASH_ENTRY; + + +typedef struct st_safe_hash_with_default +{ +#ifdef THREAD + rw_lock_t mutex; +#endif + HASH hash; + uchar *default_value; + SAFE_HASH_ENTRY *root; +} SAFE_HASH; + + +my_bool safe_hash_init(SAFE_HASH *hash, uint elements, + uchar *default_value); +void safe_hash_free(SAFE_HASH *hash); +uchar *safe_hash_search(SAFE_HASH *hash, const uchar *key, uint length, + uchar *def); +my_bool safe_hash_set(SAFE_HASH *hash, const uchar *key, uint length, + uchar *data); +void safe_hash_change(SAFE_HASH *hash, uchar *old_data, uchar *new_data); diff --git a/mysys/my_seek.c b/mysys/my_seek.c index 2c661baeff7..4e18b510a1e 100644 --- a/mysys/my_seek.c +++ b/mysys/my_seek.c @@ -47,7 +47,7 @@ my_off_t my_seek(File fd, my_off_t pos, int whence, { reg1 os_off_t newpos= -1; DBUG_ENTER("my_seek"); - DBUG_PRINT("my",("Fd: %d Hpos: %lu Pos: %lu Whence: %d MyFlags: %d", + DBUG_PRINT("my",("fd: %d Hpos: %lu Pos: %lu Whence: %d MyFlags: %d", fd, (ulong) (((ulonglong) pos) >> 32), (ulong) pos, whence, MyFlags)); DBUG_ASSERT(pos != MY_FILEPOS_ERROR); /* safety check */ @@ -87,7 +87,7 @@ my_off_t my_tell(File fd, myf MyFlags __attribute__((unused))) { os_off_t pos; DBUG_ENTER("my_tell"); - DBUG_PRINT("my",("Fd: %d MyFlags: %d",fd, MyFlags)); + DBUG_PRINT("my",("fd: %d MyFlags: %d",fd, MyFlags)); DBUG_ASSERT(fd >= 0); #ifdef HAVE_TELL pos=tell(fd); diff --git a/mysys/my_sleep.c b/mysys/my_sleep.c index 87170e4af41..cb21c15a925 100644 --- a/mysys/my_sleep.c +++ b/mysys/my_sleep.c @@ -30,7 +30,7 @@ void my_sleep(ulong m_seconds) t.tv_usec= m_seconds % 1000000L; select(0,0,0,0,&t); /* sleep */ #else - uint sec= (uint) (m_seconds / 1000000L); + uint sec= (uint) ((m_seconds + 999999L) / 1000000L); ulong start= (ulong) time((time_t*) 0); while ((ulong) time((time_t*) 0) < start+sec); #endif diff --git a/mysys/my_static.c b/mysys/my_static.c index d0c20da828a..c33d05420c9 100644 --- a/mysys/my_static.c +++ b/mysys/my_static.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2000 MySQL AB +/* Copyright (C) 2000-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -26,7 +26,7 @@ my_bool timed_mutexes= 0; /* from my_init */ char * home_dir=0; -const char *my_progname=0; +const char *my_progname= NULL, *my_progname_short= NULL; char NEAR curr_dir[FN_REFLEN]= {0}, NEAR home_dir_buff[FN_REFLEN]= {0}; ulong my_stream_opened=0,my_file_opened=0, my_tmp_file_created=0; @@ -92,6 +92,19 @@ int (*error_handler_hook)(uint error,const char *str,myf MyFlags)= int (*fatal_error_handler_hook)(uint error,const char *str,myf MyFlags)= my_message_no_curses; +static const char *proc_info_dummy(void *a __attribute__((unused)), + const char *b __attribute__((unused)), + const char *c __attribute__((unused)), + const char *d __attribute__((unused)), + const unsigned int e __attribute__((unused))) +{ + return 0; +} + +/* this is to be able to call set_thd_proc_info from the C code */ +const char *(*proc_info_hook)(void *, const char *, const char *, const char *, + const unsigned int)= proc_info_dummy; + #ifdef __WIN__ /* from my_getsystime.c */ ulonglong query_performance_frequency, query_performance_offset; diff --git a/mysys/my_sync.c b/mysys/my_sync.c index ba6964b00d6..1b8420c034e 100644 --- a/mysys/my_sync.c +++ b/mysys/my_sync.c @@ -44,7 +44,7 @@ int my_sync(File fd, myf my_flags) { int res; DBUG_ENTER("my_sync"); - DBUG_PRINT("my",("Fd: %d my_flags: %d", fd, my_flags)); + DBUG_PRINT("my",("fd: %d my_flags: %d", fd, my_flags)); do { diff --git a/mysys/my_thr_init.c b/mysys/my_thr_init.c index 2278c467f32..716552bc31b 100644 --- a/mysys/my_thr_init.c +++ b/mysys/my_thr_init.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2000 MySQL AB +/* Copyright (C) 2000-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -40,12 +40,6 @@ pthread_mutex_t LOCK_localtime_r; #ifndef HAVE_GETHOSTBYNAME_R pthread_mutex_t LOCK_gethostbyname_r; #endif -#ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP -pthread_mutexattr_t my_fast_mutexattr; -#endif -#ifdef PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP -pthread_mutexattr_t my_errorcheck_mutexattr; -#endif #ifdef TARGET_OS_LINUX @@ -65,6 +59,38 @@ nptl_pthread_exit_hack_handler(void *arg __attribute((unused))) #endif /* TARGET_OS_LINUX */ + +/** + Initialize thread attributes. +*/ + +void my_threadattr_global_init(void) +{ +#ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP + /* + Set mutex type to "fast" a.k.a "adaptive" + + In this case the thread may steal the mutex from some other thread + that is waiting for the same mutex. This will save us some + context switches but may cause a thread to 'starve forever' while + waiting for the mutex (not likely if the code within the mutex is + short). 
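A sketch of the intended consumption (my reading, not stated in the patch): once my_threadattr_global_init() has run early in process start-up, the adaptive attribute can seed fast mutexes directly:

#ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
pthread_mutex_t m;

my_threadattr_global_init();                 /* once, before any mutex is created */
pthread_mutex_init(&m, &my_fast_mutexattr);  /* adaptive: fewer context switches,
                                                weaker fairness */
#endif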
+  */
+  pthread_mutexattr_init(&my_fast_mutexattr);   /* ?= MY_MUTEX_INIT_FAST */
+  pthread_mutexattr_settype(&my_fast_mutexattr,
+                            PTHREAD_MUTEX_ADAPTIVE_NP);
+#endif
+#ifdef PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP
+  /*
+    Set mutex type to "errorcheck"
+  */
+  pthread_mutexattr_init(&my_errorcheck_mutexattr);
+  pthread_mutexattr_settype(&my_errorcheck_mutexattr,
+                            PTHREAD_MUTEX_ERRORCHECK);
+#endif
+}
+
+
 static uint get_thread_lib(void);
 /*
@@ -115,30 +141,16 @@ my_bool my_thread_global_init(void)
 }
 #endif /* TARGET_OS_LINUX */
-#ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
-  /*
-    Set mutex type to "fast" a.k.a "adaptive"
+  /* Mutexes used by my_thread_init() and still needed after my_thread_destroy_mutex() */
+  my_pthread_mutex_init(&THR_LOCK_threads, MY_MUTEX_INIT_FAST,
+                        "THR_LOCK_threads", MYF_NO_DEADLOCK_DETECTION);
+  my_pthread_mutex_init(&THR_LOCK_malloc, MY_MUTEX_INIT_FAST,
+                        "THR_LOCK_malloc", MYF_NO_DEADLOCK_DETECTION);
-    In this case the thread may steal the mutex from some other thread
-    that is waiting for the same mutex. This will save us some
-    context switches but may cause a thread to 'starve forever' while
-    waiting for the mutex (not likely if the code within the mutex is
-    short).
-  */
-  pthread_mutexattr_init(&my_fast_mutexattr);
-  pthread_mutexattr_settype(&my_fast_mutexattr,
-                            PTHREAD_MUTEX_ADAPTIVE_NP);
-#endif
-#ifdef PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP
-  /*
-    Set mutex type to "errorcheck"
-  */
-  pthread_mutexattr_init(&my_errorcheck_mutexattr);
-  pthread_mutexattr_settype(&my_errorcheck_mutexattr,
-                            PTHREAD_MUTEX_ERRORCHECK);
-#endif
+  if (my_thread_init())
+    return 1;
-  pthread_mutex_init(&THR_LOCK_malloc,MY_MUTEX_INIT_FAST);
+  /* Mutexes used by mysys */
   pthread_mutex_init(&THR_LOCK_open,MY_MUTEX_INIT_FAST);
   pthread_mutex_init(&THR_LOCK_lock,MY_MUTEX_INIT_FAST);
   pthread_mutex_init(&THR_LOCK_isam,MY_MUTEX_INIT_SLOW);
@@ -146,7 +158,6 @@ my_bool my_thread_global_init(void)
   pthread_mutex_init(&THR_LOCK_heap,MY_MUTEX_INIT_FAST);
   pthread_mutex_init(&THR_LOCK_net,MY_MUTEX_INIT_FAST);
   pthread_mutex_init(&THR_LOCK_charset,MY_MUTEX_INIT_FAST);
-  pthread_mutex_init(&THR_LOCK_threads,MY_MUTEX_INIT_FAST);
   pthread_mutex_init(&THR_LOCK_time,MY_MUTEX_INIT_FAST);
   pthread_cond_init(&THR_COND_threads, NULL);
 #if defined( __WIN__) || defined(OS2)
@@ -158,53 +169,85 @@
 #ifndef HAVE_GETHOSTBYNAME_R
   pthread_mutex_init(&LOCK_gethostbyname_r,MY_MUTEX_INIT_SLOW);
 #endif
-  if (my_thread_init())
-  {
-    my_thread_global_end();			/* Clean up */
-    return 1;
-  }
   return 0;
 }
-void my_thread_global_end(void)
+/**
+  Wait for all threads in the system to die
+
+  @fn    my_wait_for_other_threads_to_die()
+  @param number_of_threads  Wait until at most this number of threads is left
+
+  @retval 1  At most number_of_threads are left
+  @retval 0  Wait timed out
+*/
+
+my_bool my_wait_for_other_threads_to_die(uint number_of_threads)
 {
   struct timespec abstime;
   my_bool all_threads_killed= 1;
   set_timespec(abstime, my_thread_end_wait_time);
   pthread_mutex_lock(&THR_LOCK_threads);
-  while (THR_thread_count > 0)
+  while (THR_thread_count > number_of_threads)
   {
     int error= pthread_cond_timedwait(&THR_COND_threads, &THR_LOCK_threads,
                                       &abstime);
     if (error == ETIMEDOUT || error == ETIME)
     {
-#ifdef HAVE_PTHREAD_KILL
-      /*
-        We shouldn't give an error here, because if we don't have
-        pthread_kill(), programs like mysqld can't ensure that all threads
-        are killed when we enter here.
-      */
-      if (THR_thread_count)
-        fprintf(stderr,
-                "Error in my_thread_global_end(): %d threads didn't exit\n",
-                THR_thread_count);
-#endif
       all_threads_killed= 0;
       break;
     }
   }
   pthread_mutex_unlock(&THR_LOCK_threads);
+  return all_threads_killed;
+}
-  pthread_key_delete(THR_KEY_mysys);
-#ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
-  pthread_mutexattr_destroy(&my_fast_mutexattr);
-#endif
-#ifdef PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP
-  pthread_mutexattr_destroy(&my_errorcheck_mutexattr);
+
+/**
+  End the mysys thread system. Called when ending the last thread
+*/
+
+
+void my_thread_global_end(void)
+{
+  my_bool all_threads_killed;
+
+  if (!(all_threads_killed= my_wait_for_other_threads_to_die(0)))
+  {
+#ifdef HAVE_PTHREAD_KILL
+    /*
+      We shouldn't give an error here, because if we don't have
+      pthread_kill(), programs like mysqld can't ensure that all threads
+      are killed when we enter here.
+    */
+    if (THR_thread_count)
+      fprintf(stderr,
+              "Error in my_thread_global_end(): %d threads didn't exit\n",
+              THR_thread_count);
 #endif
-  pthread_mutex_destroy(&THR_LOCK_malloc);
+  }
+
+  pthread_key_delete(THR_KEY_mysys);
+  if (all_threads_killed)
+  {
+    pthread_mutex_destroy(&THR_LOCK_threads);
+    pthread_cond_destroy(&THR_COND_threads);
+    pthread_mutex_destroy(&THR_LOCK_malloc);
+  }
+}
+
+/* Free all mutexes used by mysys */
+
+void my_thread_destroy_mutex(void)
+{
+  struct st_my_thread_var *tmp;
+  tmp= my_pthread_getspecific(struct st_my_thread_var*,THR_KEY_mysys);
+  if (tmp)
+  {
+    safe_mutex_free_deadlock_data(&tmp->mutex);
+  }
+
   pthread_mutex_destroy(&THR_LOCK_open);
   pthread_mutex_destroy(&THR_LOCK_lock);
   pthread_mutex_destroy(&THR_LOCK_isam);
@@ -213,11 +256,6 @@ void my_thread_global_end(void)
   pthread_mutex_destroy(&THR_LOCK_net);
   pthread_mutex_destroy(&THR_LOCK_time);
   pthread_mutex_destroy(&THR_LOCK_charset);
-  if (all_threads_killed)
-  {
-    pthread_mutex_destroy(&THR_LOCK_threads);
-    pthread_cond_destroy(&THR_COND_threads);
-  }
 #if !defined(HAVE_LOCALTIME_R) || !defined(HAVE_GMTIME_R)
   pthread_mutex_destroy(&LOCK_localtime_r);
 #endif
@@ -256,7 +294,7 @@ my_bool my_thread_init(void)
 #ifdef EXTRA_DEBUG_THREADS
   fprintf(stderr,"my_thread_init(): thread_id: 0x%lx\n",
           (ulong) pthread_self());
-#endif
+#endif
 #if !defined(__WIN__) || defined(USE_TLS)
   if (my_pthread_getspecific(struct st_my_thread_var *,THR_KEY_mysys))
   {
 #ifdef EXTRA_DEBUG_THREADS
     fprintf(stderr,"my_thread_init() called more than once in thread 0x%lx\n",
            (long) pthread_self());
-#endif
+#endif
     goto end;
   }
   if (!(tmp= (struct st_my_thread_var *) calloc(1, sizeof(*tmp))))
@@ -287,14 +325,18 @@ my_bool my_thread_init(void)
 #else
   tmp->pthread_self= pthread_self();
 #endif
-  pthread_mutex_init(&tmp->mutex,MY_MUTEX_INIT_FAST);
+  my_pthread_mutex_init(&tmp->mutex, MY_MUTEX_INIT_FAST, "mysys_var->mutex",
+                        0);
   pthread_cond_init(&tmp->suspend, NULL);
-  tmp->init= 1;
+
+  tmp->stack_ends_here= (char*)&tmp +
+                         STACK_DIRECTION * (long)my_thread_stack_size;
   pthread_mutex_lock(&THR_LOCK_threads);
   tmp->id= ++thread_id;
   ++THR_thread_count;
   pthread_mutex_unlock(&THR_LOCK_threads);
+  tmp->init= 1;
 #ifndef DBUG_OFF
   /* Generate unique name for thread */
   (void) my_thread_name();
@@ -325,9 +367,16 @@ void my_thread_end(void)
 #ifdef EXTRA_DEBUG_THREADS
   fprintf(stderr,"my_thread_end(): tmp: 0x%lx  pthread_self: 0x%lx  thread_id: %ld\n",
	  (long) tmp, (long) pthread_self(), tmp ?
(long) tmp->id : 0L);
-#endif
+#endif
   if (tmp && tmp->init)
   {
+
+#if !defined(__bsdi__) && !defined(__OpenBSD__)
+    /* bsdi and openbsd 3.5 dump core here */
+    pthread_cond_destroy(&tmp->suspend);
+#endif
+    pthread_mutex_destroy(&tmp->mutex);
+
 #if !defined(DBUG_OFF)
     /* tmp->dbug is allocated inside DBUG library */
     if (tmp->dbug)
     {
       tmp->dbug=0;
     }
 #endif
-#if !defined(__bsdi__) && !defined(__OpenBSD__)
-    /* bsdi and openbsd 3.5 dumps core here */
-    pthread_cond_destroy(&tmp->suspend);
-#endif
-    pthread_mutex_destroy(&tmp->mutex);
 #if !defined(__WIN__) || defined(USE_TLS)
+#ifndef DBUG_OFF
+    /* To find bugs when accessing unallocated data */
+    bfill(tmp, sizeof(*tmp), 0x8F);
+#endif
     free(tmp);
 #else
     tmp->init= 0;
 #endif
+#if !defined(__WIN__) || defined(USE_TLS)
+    pthread_setspecific(THR_KEY_mysys,0);
+#endif
     /*
       Decrement counter for number of running threads. We are using this
       in my_thread_global_end() to wait until all threads have called
@@ -360,10 +411,12 @@ void my_thread_end(void)
       pthread_cond_signal(&THR_COND_threads);
     pthread_mutex_unlock(&THR_LOCK_threads);
   }
-  /* The following free has to be done, even if my_thread_var() is 0 */
+  else
+  {
 #if !defined(__WIN__) || defined(USE_TLS)
-  pthread_setspecific(THR_KEY_mysys,0);
+    pthread_setspecific(THR_KEY_mysys,0);
 #endif
+  }
 }
 struct st_my_thread_var *_my_thread_var(void)
@@ -371,6 +424,25 @@
   return my_pthread_getspecific(struct st_my_thread_var*,THR_KEY_mysys);
 }
+#ifndef DBUG_OFF
+/* Return pointer to DBUG for holding current state */
+
+extern void **my_thread_var_dbug()
+{
+  struct st_my_thread_var *tmp=
+    my_pthread_getspecific(struct st_my_thread_var*,THR_KEY_mysys);
+  return tmp && tmp->init ? &tmp->dbug : 0;
+}
+#endif
+
+/* Return pointer to mutex_in_use */
+
+safe_mutex_t **my_thread_var_mutex_in_use()
+{
+  struct st_my_thread_var *tmp=
+    my_pthread_getspecific(struct st_my_thread_var*,THR_KEY_mysys);
+  return tmp ? &tmp->mutex_in_use : 0;
+}
 /****************************************************************************
   Get name of current thread.
diff --git a/mysys/my_uuid.c b/mysys/my_uuid.c
new file mode 100644
index 00000000000..d1e8331aaa1
--- /dev/null
+++ b/mysys/my_uuid.c
@@ -0,0 +1,243 @@
+/* Copyright (C) 2007 MySQL AB, Sergei Golubchik & Michael Widenius
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/*
+  implements Universal Unique Identifiers (UUIDs), as in
+    DCE 1.1: Remote Procedure Call,
+    Open Group Technical Standard Document Number C706, October 1997,
+    (supersedes C309 DCE: Remote Procedure Call 8/1994,
+    which was basis for ISO/IEC 11578:1996 specification)
+
+  A UUID has the following structure:
+
+  Field                      NDR Data Type   Octet #   Note
+  time_low                   unsigned long   0-3       The low field of the
+                                                       timestamp.
+  time_mid                   unsigned short  4-5       The middle field of
+                                                       the timestamp.
+ time_hi_and_version unsigned short 6-7 The high field of the + timestamp multiplexed + with the version number. + clock_seq_hi_and_reserved unsigned small 8 The high field of the + clock sequence multi- + plexed with the variant. + clock_seq_low unsigned small 9 The low field of the + clock sequence. + node character 10-15 The spatially unique node + identifier. +*/ + +#include "mysys_priv.h" +#include <m_string.h> +#include <myisampack.h> /* mi_int2store, mi_int4store */ + +static my_bool my_uuid_inited= 0; +static struct my_rnd_struct uuid_rand; +static uint nanoseq; +static ulonglong uuid_time= 0; +static uchar uuid_suffix[2+6]; /* clock_seq and node */ + +#ifdef THREAD +pthread_mutex_t LOCK_uuid_generator; +#endif + +/* + Number of 100-nanosecond intervals between + 1582-10-15 00:00:00.00 and 1970-01-01 00:00:00.00 +*/ + +#define UUID_TIME_OFFSET ((ulonglong) 141427 * 24 * 60 * 60 * \ + 1000 * 1000 * 10) +#define UUID_VERSION 0x1000 +#define UUID_VARIANT 0x8000 + + +/* Helper function */ + +static void set_clock_seq() +{ + uint16 clock_seq= ((uint)(my_rnd(&uuid_rand)*16383)) | UUID_VARIANT; + mi_int2store(uuid_suffix, clock_seq); +} + + +/** + Init structures needed for my_uuid + + @func my_uuid_init() + @param seed1 Seed for random generator + @param seed2 Seed for random generator + + @note + Seed1 & seed2 should NOT depend on clock. This is to be able to + generate a random mac address according to UUID specs. +*/ + +void my_uuid_init(ulong seed1, ulong seed2) +{ + uchar *mac= uuid_suffix+2; + ulonglong now; + + if (my_uuid_inited) + return; + my_uuid_inited= 1; + now= my_getsystime(); + nanoseq= 0; + + if (my_gethwaddr(mac)) + { + uint i; + /* + Generating random "hardware addr" + + Specs explicitly specify that node identifier should NOT + correlate with a clock_seq value, so we use a separate + randominit() here. + */ + /* purecov: begin inspected */ + my_rnd_init(&uuid_rand, (ulong) (seed2+ now/2), (ulong) (now+rand())); + for (i=0; i < sizeof(mac); i++) + mac[i]= (uchar)(my_rnd(&uuid_rand)*255); + /* purecov: end */ + } + my_rnd_init(&uuid_rand, (ulong) (seed1 + now), (ulong) (now/2+ getpid())); + set_clock_seq(); + pthread_mutex_init(&LOCK_uuid_generator, MY_MUTEX_INIT_FAST); +} + + +/** + Create a global unique identifier (uuid) + + @func my_uuid() + @param to Store uuid here. Must be of size MY_uuid_SIZE (16) +*/ + +void my_uuid(uchar *to) +{ + ulonglong tv; + uint32 time_low; + uint16 time_mid, time_hi_and_version; + + DBUG_ASSERT(my_uuid_inited); + + pthread_mutex_lock(&LOCK_uuid_generator); + tv= my_getsystime() + UUID_TIME_OFFSET + nanoseq; + + if (likely(tv > uuid_time)) + { + /* + Current time is ahead of last timestamp, as it should be. + If we "borrowed time", give it back, just as long as we + stay ahead of the previous timestamp. + */ + if (nanoseq) + { + ulong delta; + DBUG_ASSERT((tv > uuid_time) && (nanoseq > 0)); + /* + -1 so we won't make tv= uuid_time for nanoseq >= (tv - uuid_time) + */ + delta= min(nanoseq, (ulong)(tv - uuid_time -1)); + tv-= delta; + nanoseq-= delta; + } + } + else + { + if (unlikely(tv == uuid_time)) + { + /* + For low-res system clocks. If several requests for UUIDs + end up on the same tick, we add a nano-second to make them + different. + ( current_timestamp + nanoseq * calls_in_this_period ) + may end up > next_timestamp; this is OK. Nonetheless, we'll + try to unwind nanoseq when we get a chance to. 
+        If nanoseq overflows, we'll start over with a new numberspace
+        (the if() below is needed so that, when nanoseq overflows, we
+        skip the ++tv and fall through into the follow-up if()).
+      */
+      if (likely(++nanoseq))
+        ++tv;
+    }
+
+    if (unlikely(tv <= uuid_time))
+    {
+      /*
+        If the admin changes the system clock (or due to Daylight
+        Saving Time), the system clock may be turned *back* so we
+        go through a period once more for which we already gave out
+        UUIDs. To avoid duplicate UUIDs despite potentially identical
+        times, we make a new random component.
+        We also come here if the nanoseq "borrowing" overflows.
+        In either case, we throw away any nanoseq borrowing since it's
+        irrelevant in the new numberspace.
+      */
+      set_clock_seq();
+      tv= my_getsystime() + UUID_TIME_OFFSET;
+      nanoseq= 0;
+      DBUG_PRINT("uuid",("making new numberspace"));
+    }
+  }
+
+  uuid_time=tv;
+  pthread_mutex_unlock(&LOCK_uuid_generator);
+
+  time_low=            (uint32) (tv & 0xFFFFFFFF);
+  time_mid=            (uint16) ((tv >> 32) & 0xFFFF);
+  time_hi_and_version= (uint16) ((tv >> 48) | UUID_VERSION);
+
+  /*
+    Note that the standard does NOT specify byte ordering in
+    multi-byte fields. It is implementation-defined (but must be
+    the same for all fields).
+    We use big-endian, so we can use memcmp() to compare UUIDs
+    and for straightforward UUID to string conversion.
+  */
+  mi_int4store(to, time_low);
+  mi_int2store(to+4, time_mid);
+  mi_int2store(to+6, time_hi_and_version);
+  bmove(to+8, uuid_suffix, sizeof(uuid_suffix));
+}
+
+
+/**
+  Convert a UUID to its string representation
+
+  @func  my_uuid2str()
+  @param guid uuid
+  @param s    Output buffer. Must be at least MY_UUID_STRING_LENGTH+1
+              bytes large.
+*/
+void my_uuid2str(const uchar *guid, char *s)
+{
+  int i;
+  for (i=0; i < MY_UUID_SIZE; i++)
+  {
+    *s++= _dig_vec_lower[guid[i] >> 4];
+    *s++= _dig_vec_lower[guid[i] & 15];
+    if (i == 3 || i == 5 || i == 7 || i == 9)
+      *s++= '-';
+  }
+}
+
+void my_uuid_end()
+{
+  if (my_uuid_inited)
+  {
+    my_uuid_inited= 0;
+    pthread_mutex_destroy(&LOCK_uuid_generator);
+  }
+}
diff --git a/mysys/my_wincond.c b/mysys/my_wincond.c
index d1b07b61408..8b548a64079 100644
--- a/mysys/my_wincond.c
+++ b/mysys/my_wincond.c
@@ -17,6 +17,7 @@
 ** The following is a simple implementation of posix conditions
 *****************************************************************************/
+#include <my_global.h>
 #undef SAFE_MUTEX                       /* Avoid safe_mutex redefinitions */
 #include "mysys_priv.h"
 #if defined(THREAD) && defined(__WIN__)
@@ -126,7 +127,7 @@ int pthread_cond_timedwait(pthread_cond_t *cond, pthread_mutex_t *mutex,
   EnterCriticalSection(&cond->lock_waiting);
   cond->waiting--;
-  if (cond->waiting == 0 && result == (WAIT_OBJECT_0+BROADCAST))
+  if (cond->waiting == 0)
   {
     /*
       We're the last waiter to be notified or to stop waiting, so
diff --git a/mysys/my_winthread.c b/mysys/my_winthread.c
index e94369bec32..8bda595451b 100644
--- a/mysys/my_winthread.c
+++ b/mysys/my_winthread.c
@@ -18,6 +18,7 @@
 *****************************************************************************/
 /* SAFE_MUTEX will not work until the thread structure is up to date */
+#include <my_global.h>
 #undef SAFE_MUTEX
 #include "mysys_priv.h"
@@ -77,12 +78,15 @@ pthread_handler_t pthread_start(void *param)
 {
   pthread_handler func=((struct pthread_map *) param)->func;
   void *func_param=((struct pthread_map *) param)->param;
+  void *result;
   my_thread_init();                     /* Will always succeed in windows */
   pthread_mutex_lock(&THR_LOCK_thread); /* Wait for beginthread to return */
   win_pthread_self=((struct pthread_map *) param)->pthreadself;
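/*
  A minimal usage sketch for the my_uuid API added above (illustrative
  only: it assumes MY_UUID_SIZE == 16 and MY_UUID_STRING_LENGTH == 36 as
  the doc comments suggest; the seed values are the caller's business as
  long as they do not track the clock):

    uchar id[MY_UUID_SIZE];
    char  str[MY_UUID_STRING_LENGTH + 1];

    my_uuid_init((ulong) getpid(), 12345);  // any non-clock-derived seeds
    my_uuid(id);                            // 16 bytes, big-endian fields
    my_uuid2str(id, str);                   // 36 chars, no terminator
    str[MY_UUID_STRING_LENGTH]= 0;          // terminate it ourselves
    my_uuid_end();
*/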
pthread_mutex_unlock(&THR_LOCK_thread); free((char*) param); /* Free param from create */ - pthread_exit((void*) (*func)(func_param)); + result= (void*) (*func)(func_param); + my_thread_end(); + pthread_exit(result); return 0; /* Safety */ } @@ -92,21 +96,28 @@ int pthread_create(pthread_t *thread_id, pthread_attr_t *attr, { HANDLE hThread; struct pthread_map *map; + DWORD StackSize= 0; + int priority= 0; DBUG_ENTER("pthread_create"); if (!(map=malloc(sizeof(*map)))) DBUG_RETURN(-1); map->func=func; map->param=param; + if (attr != NULL) + { + StackSize= attr->dwStackSize; + priority= attr->priority; + } + if (StackSize == 0) + StackSize= PTHREAD_STACK_MIN; pthread_mutex_lock(&THR_LOCK_thread); #ifdef __BORLANDC__ hThread=(HANDLE)_beginthread((void(_USERENTRY *)(void *)) pthread_start, - attr->dwStackSize ? attr->dwStackSize : - 65535, (void*) map); + StackSize, (void*) map); #else hThread=(HANDLE)_beginthread((void( __cdecl *)(void *)) pthread_start, - attr->dwStackSize ? attr->dwStackSize : - 65535, (void*) map); + StackSize, (void*) map); #endif DBUG_PRINT("info", ("hThread=%lu",(long) hThread)); *thread_id=map->pthreadself=hThread; @@ -119,7 +130,7 @@ int pthread_create(pthread_t *thread_id, pthread_attr_t *attr, ("Can't create thread to handle request (error %d)",error)); DBUG_RETURN(error ? error : -1); } - VOID(SetThreadPriority(hThread, attr->priority)) ; + VOID(SetThreadPriority(hThread, priority)) ; DBUG_RETURN(0); } diff --git a/mysys/my_write.c b/mysys/my_write.c index d7eb390bdd2..52127545888 100644 --- a/mysys/my_write.c +++ b/mysys/my_write.c @@ -25,7 +25,7 @@ size_t my_write(int Filedes, const uchar *Buffer, size_t Count, myf MyFlags) size_t writenbytes, written; uint errors; DBUG_ENTER("my_write"); - DBUG_PRINT("my",("Fd: %d Buffer: 0x%lx Count: %lu MyFlags: %d", + DBUG_PRINT("my",("fd: %d Buffer: 0x%lx Count: %lu MyFlags: %d", Filedes, (long) Buffer, (ulong) Count, MyFlags)); errors=0; written=0; diff --git a/mysys/mysys_priv.h b/mysys/mysys_priv.h index 6e0959ae08c..113b64005f2 100644 --- a/mysys/mysys_priv.h +++ b/mysys/mysys_priv.h @@ -33,6 +33,7 @@ extern pthread_mutex_t THR_LOCK_charset, THR_LOCK_time; #include <my_no_pthread.h> #endif + /* EDQUOT is used only in 3 C files only in mysys/. If it does not exist on system, we set it to some value which can never happen. @@ -42,3 +43,5 @@ extern pthread_mutex_t THR_LOCK_charset, THR_LOCK_time; #endif void my_error_unregister_all(void); +void my_thread_destroy_mutex(void); +my_bool my_wait_for_other_threads_to_die(uint number_of_threads); diff --git a/mysys/safemalloc.c b/mysys/safemalloc.c index c484f1d4c54..f507a072b69 100644 --- a/mysys/safemalloc.c +++ b/mysys/safemalloc.c @@ -439,7 +439,6 @@ void TERMINATE(FILE *file, uint flag) This is usefull to call from withing a debugger */ - void sf_malloc_report_allocated(void *memory) { struct st_irem *irem; diff --git a/mysys/test_thr_mutex.c b/mysys/test_thr_mutex.c new file mode 100644 index 00000000000..0bd14a0d31b --- /dev/null +++ b/mysys/test_thr_mutex.c @@ -0,0 +1,162 @@ +/* Copyright (C) 2008 Sun Microsystems, Inc + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Testing of deadlock detector */
+
+#include <my_global.h>
+#include <mysys_priv.h>
+
+
+int main(int argc __attribute__((unused)), char** argv)
+{
+  pthread_mutex_t LOCK_A, LOCK_B, LOCK_C, LOCK_D, LOCK_E, LOCK_F, LOCK_G;
+  pthread_mutex_t LOCK_H, LOCK_I;
+  MY_INIT(argv[0]);
+  DBUG_ENTER("main");
+
+  DBUG_PUSH("d:t:O,/tmp/trace");
+  printf("This program is testing the mutex deadlock detection.\n"
+         "It should print out different failures of wrong mutex usage "
+         "on stderr\n\n");
+
+  safe_mutex_deadlock_detector= 1;
+  pthread_mutex_init(&LOCK_A, MY_MUTEX_INIT_FAST);
+  pthread_mutex_init(&LOCK_B, MY_MUTEX_INIT_FAST);
+  pthread_mutex_init(&LOCK_C, MY_MUTEX_INIT_FAST);
+  pthread_mutex_init(&LOCK_D, MY_MUTEX_INIT_FAST);
+  pthread_mutex_init(&LOCK_E, MY_MUTEX_INIT_FAST);
+  pthread_mutex_init(&LOCK_F, MY_MUTEX_INIT_FAST);
+  pthread_mutex_init(&LOCK_G, MY_MUTEX_INIT_FAST);
+  pthread_mutex_init(&LOCK_H, MY_MUTEX_INIT_FAST);
+  pthread_mutex_init(&LOCK_I, MY_MUTEX_INIT_FAST);
+
+  printf("Testing A->B and B->A\n");
+  fflush(stdout);
+  pthread_mutex_lock(&LOCK_A);
+  pthread_mutex_lock(&LOCK_B);
+  pthread_mutex_unlock(&LOCK_A);
+  pthread_mutex_unlock(&LOCK_B);
+
+  /* Test different (wrong) lock order */
+  pthread_mutex_lock(&LOCK_B);
+  pthread_mutex_lock(&LOCK_A);          /* Should give warning */
+
+  pthread_mutex_unlock(&LOCK_A);
+  pthread_mutex_unlock(&LOCK_B);
+
+  /* Check that we don't get another warning for the same lock */
+  printf("Testing A->B and B->A again (should not give a warning)\n");
+  pthread_mutex_lock(&LOCK_B);
+  pthread_mutex_lock(&LOCK_A);
+  pthread_mutex_unlock(&LOCK_A);
+  pthread_mutex_unlock(&LOCK_B);
+
+  /*
+    Test of a ring with many mutexes.
+    We also unlock the mutexes in different orders to get the unlock code
+    properly tested.
+ */ + printf("Testing A->C and C->D and D->A\n"); + pthread_mutex_lock(&LOCK_A); + pthread_mutex_lock(&LOCK_C); + pthread_mutex_unlock(&LOCK_A); + pthread_mutex_unlock(&LOCK_C); + pthread_mutex_lock(&LOCK_C); + pthread_mutex_lock(&LOCK_D); + pthread_mutex_unlock(&LOCK_D); + pthread_mutex_unlock(&LOCK_C); + + pthread_mutex_lock(&LOCK_D); + pthread_mutex_lock(&LOCK_A); /* Should give warning */ + + pthread_mutex_unlock(&LOCK_A); + pthread_mutex_unlock(&LOCK_D); + + printf("Testing E -> F ; H -> I ; F -> H ; H -> I -> E\n"); + fflush(stdout); + + pthread_mutex_lock(&LOCK_E); + pthread_mutex_lock(&LOCK_F); + pthread_mutex_unlock(&LOCK_E); + pthread_mutex_unlock(&LOCK_F); + pthread_mutex_lock(&LOCK_H); + pthread_mutex_lock(&LOCK_I); + pthread_mutex_unlock(&LOCK_I); + pthread_mutex_unlock(&LOCK_H); + pthread_mutex_lock(&LOCK_F); + pthread_mutex_lock(&LOCK_H); + pthread_mutex_unlock(&LOCK_H); + pthread_mutex_unlock(&LOCK_F); + + pthread_mutex_lock(&LOCK_H); + pthread_mutex_lock(&LOCK_I); + pthread_mutex_lock(&LOCK_E); /* Should give warning */ + + pthread_mutex_unlock(&LOCK_E); + pthread_mutex_unlock(&LOCK_I); + pthread_mutex_unlock(&LOCK_H); + + printf("\nFollowing shouldn't give any warnings\n"); + printf("Testing A->B and B->A without deadlock detection\n"); + fflush(stdout); + + /* Reinitialize mutex to get rid of old wrong usage markers */ + pthread_mutex_destroy(&LOCK_A); + pthread_mutex_destroy(&LOCK_B); + pthread_mutex_init(&LOCK_A, MY_MUTEX_INIT_FAST); + pthread_mutex_init(&LOCK_B, MY_MUTEX_INIT_FAST); + + /* Start testing */ + my_pthread_mutex_lock(&LOCK_A, MYF(MYF_NO_DEADLOCK_DETECTION)); + pthread_mutex_lock(&LOCK_B); + pthread_mutex_unlock(&LOCK_A); + pthread_mutex_unlock(&LOCK_B); + + pthread_mutex_lock(&LOCK_A); + my_pthread_mutex_lock(&LOCK_B, MYF(MYF_NO_DEADLOCK_DETECTION)); + pthread_mutex_unlock(&LOCK_A); + pthread_mutex_unlock(&LOCK_B); + + printf("Testing A -> C ; B -> C ; A->B\n"); + fflush(stdout); + pthread_mutex_lock(&LOCK_A); + pthread_mutex_lock(&LOCK_C); + pthread_mutex_unlock(&LOCK_C); + pthread_mutex_unlock(&LOCK_A); + + pthread_mutex_lock(&LOCK_B); + pthread_mutex_lock(&LOCK_C); + pthread_mutex_unlock(&LOCK_C); + pthread_mutex_unlock(&LOCK_B); + + pthread_mutex_lock(&LOCK_A); + pthread_mutex_lock(&LOCK_B); + pthread_mutex_unlock(&LOCK_B); + pthread_mutex_unlock(&LOCK_A); + + /* Cleanup */ + pthread_mutex_destroy(&LOCK_A); + pthread_mutex_destroy(&LOCK_B); + pthread_mutex_destroy(&LOCK_C); + pthread_mutex_destroy(&LOCK_D); + pthread_mutex_destroy(&LOCK_E); + pthread_mutex_destroy(&LOCK_F); + pthread_mutex_destroy(&LOCK_G); + pthread_mutex_destroy(&LOCK_H); + pthread_mutex_destroy(&LOCK_I); + + my_end(MY_DONT_FREE_DBUG); + exit(0); +} diff --git a/mysys/thr_lock.c b/mysys/thr_lock.c index 31638ecee9a..3eabde85a58 100644 --- a/mysys/thr_lock.c +++ b/mysys/thr_lock.c @@ -24,7 +24,7 @@ Locks are prioritized according to: The current lock types are: -TL_READ # Low priority read +TL_READ # Low priority read TL_READ_WITH_SHARED_LOCKS TL_READ_HIGH_PRIORITY # High priority read TL_READ_NO_INSERT # Read without concurrent inserts @@ -57,17 +57,23 @@ check_status: In MyISAM this is a simple check if the insert can be done at the end of the datafile. update_status: - Before a write lock is released, this function is called. - In MyISAM this functions updates the count and length of the datafile + in thr_reschedule_write_lock(), when an insert delayed thread + downgrades TL_WRITE lock to TL_WRITE_DELAYED, to allow SELECT + threads to proceed. 
+  A storage engine should also call update_status internally
+  in the ::external_lock(F_UNLCK) method.
+  In MyISAM and CSV this function updates the length of the datafile.
 
 get_status:
 	When one gets a lock this functions is called.
 	In MyISAM this stores the number of rows and size of the datafile
 	for concurrent reads.
 
 The lock algorithm allows one to have one TL_WRITE_ALLOW_READ,
-TL_WRITE_CONCURRENT_INSERT or one TL_WRITE_DELAYED lock at the same time as
-multiple read locks.
+TL_WRITE_CONCURRENT_INSERT or one TL_WRITE_DELAYED lock at the same
+time as multiple read locks.
+In addition, if lock->allow_multiple_concurrent_insert is set then there can
+be any number of TL_WRITE_CONCURRENT_INSERT locks active at the same time.
 */
 
 #if !defined(MAIN) && !defined(DBUG_OFF) && !defined(EXTRA_DEBUG)
@@ -148,7 +154,8 @@ static int check_lock(struct st_lock_list *list, const char* lock_type,
   }
   if (same_owner &&
       !thr_lock_owner_equal(data->owner, first_owner) &&
-      last_lock_type != TL_WRITE_ALLOW_WRITE)
+      last_lock_type != TL_WRITE_ALLOW_WRITE &&
+      last_lock_type != TL_WRITE_CONCURRENT_INSERT)
   {
     fprintf(stderr,
             "Warning: Found locks from different threads in %s: %s\n",
@@ -201,7 +208,7 @@ static void check_locks(THR_LOCK *lock, const char *where,
     THR_LOCK_DATA *data;
     for (data=lock->read.data ; data ; data=data->next)
     {
-      if ((int) data->type == (int) TL_READ_NO_INSERT)
+      if (data->type == TL_READ_NO_INSERT)
        count++;
       /* Protect against infinite loop. */
       DBUG_ASSERT(count <= lock->read_no_write_count);
@@ -250,7 +257,22 @@ static void check_locks(THR_LOCK *lock, const char *where,
       }
     }
     else
-    {                                           /* Have write lock */
+    {
+      /* We have at least one write lock */
+      if (lock->write.data->type == TL_WRITE_CONCURRENT_INSERT)
+      {
+        THR_LOCK_DATA *data;
+        for (data=lock->write.data->next ; data ; data=data->next)
+        {
+          if (data->type != TL_WRITE_CONCURRENT_INSERT)
+          {
+            fprintf(stderr,
+                    "Warning at '%s': Found TL_WRITE_CONCURRENT_INSERT lock mixed with other write locks\n",
+                    where);
+            break;
+          }
+        }
+      }
       if (lock->write_wait.data)
       {
        if (!allow_no_locks &&
@@ -472,7 +494,8 @@ wait_for_lock(struct st_lock_list *wait, THR_LOCK_DATA *data,
   {
     result= THR_LOCK_SUCCESS;
     if (data->lock->get_status)
-      (*data->lock->get_status)(data->status_param, 0);
+      (*data->lock->get_status)(data->status_param,
+                                data->type == TL_WRITE_CONCURRENT_INSERT);
     check_locks(data->lock,"got wait_for_lock",0);
   }
   pthread_mutex_unlock(&data->lock->mutex);
@@ -511,7 +534,8 @@ thr_lock(THR_LOCK_DATA *data, THR_LOCK_OWNER *owner,
     /* Request for READ lock */
     if (lock->write.data)
     {
-      /* We can allow a read lock even if there is already a write lock
+      /*
+        We can allow a read lock even if there is already a write lock
        on the table in one the following cases:
        - This thread alread have a write lock on the table
        - The write lock is TL_WRITE_ALLOW_READ or TL_WRITE_DELAYED
@@ -555,11 +579,11 @@ thr_lock(THR_LOCK_DATA *data, THR_LOCK_OWNER *owner,
        (*lock->read.last)=data;        /* Add to running FIFO */
        data->prev=lock->read.last;
        lock->read.last= &data->next;
-       if (lock->get_status)
-         (*lock->get_status)(data->status_param, 0);
        if (lock_type == TL_READ_NO_INSERT)
          lock->read_no_write_count++;
        check_locks(lock,"read lock with no write locks",0);
+       if (lock->get_status)
+         (*lock->get_status)(data->status_param, 0);
        statistic_increment(locks_immediate,&THR_LOCK_lock);
        goto end;
       }
@@ -623,16 +647,18 @@ thr_lock(THR_LOCK_DATA *data, THR_LOCK_OWNER *owner,
       The following test will not work if the old lock was a
       TL_WRITE_ALLOW_WRITE, TL_WRITE_ALLOW_READ or
TL_WRITE_DELAYED in the same thread, but this will never happen within MySQL. + + The idea is to allow us to get a lock at once if we already have + a write lock or if there is no pending write locks and if all + write locks are of the same type and are either + TL_WRITE_ALLOW_WRITE or TL_WRITE_CONCURRENT_INSERT */ if (thr_lock_owner_equal(data->owner, lock->write.data->owner) || - (lock_type == TL_WRITE_ALLOW_WRITE && - !lock->write_wait.data && - lock->write.data->type == TL_WRITE_ALLOW_WRITE)) + (!lock->write_wait.data && lock_type == lock->write.data->type && + (lock_type == TL_WRITE_ALLOW_WRITE || + (lock_type == TL_WRITE_CONCURRENT_INSERT && + lock->allow_multiple_concurrent_insert)))) { - /* - We have already got a write lock or all locks are - TL_WRITE_ALLOW_WRITE - */ DBUG_PRINT("info", ("write_wait.data: 0x%lx old_type: %d", (ulong) lock->write_wait.data, lock->write.data->type)); @@ -641,8 +667,9 @@ thr_lock(THR_LOCK_DATA *data, THR_LOCK_OWNER *owner, data->prev=lock->write.last; lock->write.last= &data->next; check_locks(lock,"second write lock",0); - if (data->lock->get_status) - (*data->lock->get_status)(data->status_param, 0); + if (lock->get_status) + (*lock->get_status)(data->status_param, + lock_type == TL_WRITE_CONCURRENT_INSERT); statistic_increment(locks_immediate,&THR_LOCK_lock); goto end; } @@ -675,8 +702,8 @@ thr_lock(THR_LOCK_DATA *data, THR_LOCK_OWNER *owner, (*lock->write.last)=data; /* Add as current write lock */ data->prev=lock->write.last; lock->write.last= &data->next; - if (data->lock->get_status) - (*data->lock->get_status)(data->status_param, concurrent_insert); + if (lock->get_status) + (*lock->get_status)(data->status_param, concurrent_insert); check_locks(lock,"only write lock",0); statistic_increment(locks_immediate,&THR_LOCK_lock); goto end; @@ -784,16 +811,6 @@ void thr_unlock(THR_LOCK_DATA *data) } else lock->write.last=data->prev; - if (lock_type >= TL_WRITE_CONCURRENT_INSERT) - { - if (lock->update_status) - (*lock->update_status)(data->status_param); - } - else - { - if (lock->restore_status) - (*lock->restore_status)(data->status_param); - } if (lock_type == TL_READ_NO_INSERT) lock->read_no_write_count--; data->type=TL_UNLOCK; /* Mark unlocked */ @@ -816,7 +833,6 @@ static void wake_up_waiters(THR_LOCK *lock) { THR_LOCK_DATA *data; enum thr_lock_type lock_type; - DBUG_ENTER("wake_up_waiters"); if (!lock->write.data) /* If no active write locks */ @@ -1380,8 +1396,8 @@ my_bool thr_upgrade_write_delay_lock(THR_LOCK_DATA *data, { if (!lock->read.data) /* No read locks */ { /* We have the lock */ - if (data->lock->get_status) - (*data->lock->get_status)(data->status_param, 0); + if (lock->get_status) + (*lock->get_status)(data->status_param, 0); pthread_mutex_unlock(&lock->mutex); DBUG_RETURN(0); } @@ -1521,7 +1537,7 @@ struct st_test { enum thr_lock_type lock_type; }; -THR_LOCK locks[5]; /* 4 locks */ +THR_LOCK locks[6]; /* Number of locks +1 */ struct st_test test_0[] = {{0,TL_READ}}; /* One lock */ struct st_test test_1[] = {{0,TL_READ},{0,TL_WRITE}}; /* Read and write lock of lock 0 */ @@ -1541,9 +1557,20 @@ struct st_test test_14[] = {{0,TL_WRITE_CONCURRENT_INSERT},{1,TL_READ}}; struct st_test test_15[] = {{0,TL_WRITE_ALLOW_WRITE},{1,TL_READ}}; struct st_test test_16[] = {{0,TL_WRITE_ALLOW_WRITE},{1,TL_WRITE_ALLOW_WRITE}}; -struct st_test *tests[] = {test_0,test_1,test_2,test_3,test_4,test_5,test_6, - test_7,test_8,test_9,test_10,test_11,test_12, - test_13,test_14,test_15,test_16}; +struct st_test test_17[] = {{5,TL_WRITE_CONCURRENT_INSERT}}; 
+struct st_test test_18[] = {{5,TL_WRITE_CONCURRENT_INSERT}}; +struct st_test test_19[] = {{5,TL_READ}}; +struct st_test test_20[] = {{5,TL_READ_NO_INSERT}}; +struct st_test test_21[] = {{5,TL_WRITE}}; + + +struct st_test *tests[]= +{ + test_0, test_1, test_2, test_3, test_4, test_5, test_6, test_7, test_8, + test_9, test_10, test_11, test_12, test_13, test_14, test_15, test_16, + test_17, test_18, test_19, test_20, test_21 +}; + int lock_counts[]= {sizeof(test_0)/sizeof(struct st_test), sizeof(test_1)/sizeof(struct st_test), sizeof(test_2)/sizeof(struct st_test), @@ -1560,7 +1587,12 @@ int lock_counts[]= {sizeof(test_0)/sizeof(struct st_test), sizeof(test_13)/sizeof(struct st_test), sizeof(test_14)/sizeof(struct st_test), sizeof(test_15)/sizeof(struct st_test), - sizeof(test_16)/sizeof(struct st_test) + sizeof(test_16)/sizeof(struct st_test), + sizeof(test_17)/sizeof(struct st_test), + sizeof(test_18)/sizeof(struct st_test), + sizeof(test_19)/sizeof(struct st_test), + sizeof(test_20)/sizeof(struct st_test), + sizeof(test_21)/sizeof(struct st_test) }; @@ -1604,7 +1636,6 @@ static void *test_thread(void *arg) printf("Thread %s (%d) started\n",my_thread_name(),param); fflush(stdout); - thr_lock_info_init(&lock_info); thr_lock_owner_init(&owner, &lock_info); for (i=0; i < lock_counts[param] ; i++) @@ -1650,7 +1681,8 @@ int main(int argc __attribute__((unused)),char **argv __attribute__((unused))) { pthread_t tid; pthread_attr_t thr_attr; - int i,*param,error; + int *param,error; + uint i; MY_INIT(argv[0]); if (argc > 1 && argv[1][0] == '-' && argv[1][1] == '#') DBUG_PUSH(argv[1]+2); @@ -1670,13 +1702,14 @@ int main(int argc __attribute__((unused)),char **argv __attribute__((unused))) exit(1); } - for (i=0 ; i < (int) array_elements(locks) ; i++) + for (i=0 ; i < array_elements(locks) ; i++) { thr_lock_init(locks+i); locks[i].check_status= test_check_status; locks[i].update_status=test_update_status; locks[i].copy_status= test_copy_status; locks[i].get_status= test_get_status; + locks[i].allow_multiple_concurrent_insert= 1; } if ((error=pthread_attr_init(&thr_attr))) { @@ -1702,7 +1735,7 @@ int main(int argc __attribute__((unused)),char **argv __attribute__((unused))) #ifdef HAVE_THR_SETCONCURRENCY VOID(thr_setconcurrency(2)); #endif - for (i=0 ; i < (int) array_elements(lock_counts) ; i++) + for (i=0 ; i < array_elements(lock_counts) ; i++) { param=(int*) malloc(sizeof(int)); *param=i; @@ -1734,7 +1767,7 @@ int main(int argc __attribute__((unused)),char **argv __attribute__((unused))) } if ((error=pthread_mutex_unlock(&LOCK_thread_count))) fprintf(stderr,"Got error: %d from pthread_mutex_unlock\n",error); - for (i=0 ; i < (int) array_elements(locks) ; i++) + for (i=0 ; i < array_elements(locks) ; i++) thr_lock_delete(locks+i); #ifdef EXTRA_DEBUG if (found_errors) diff --git a/mysys/thr_mutex.c b/mysys/thr_mutex.c index 8f9928026ba..80f21e53473 100644 --- a/mysys/thr_mutex.c +++ b/mysys/thr_mutex.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2000-2003 MySQL AB +/* Copyright (C) 2000-2008 MySQL AB, 2008 Sun Microsystems, Inc This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,11 +19,16 @@ #if defined(TARGET_OS_LINUX) && !defined (__USE_UNIX98) #define __USE_UNIX98 /* To get rw locks under Linux */ #endif -#if defined(THREAD) && defined(SAFE_MUTEX) +#ifdef SAFE_MUTEX +#define SAFE_MUTEX_DEFINED +#endif + +#if defined(THREAD) #undef SAFE_MUTEX /* Avoid safe_mutex redefinitions */ #include "mysys_priv.h" #include 
"my_static.h" #include <m_string.h> +#include <hash.h> #ifndef DO_NOT_REMOVE_THREAD_WRAPPERS /* Remove wrappers */ @@ -34,34 +39,164 @@ #undef pthread_mutex_destroy #undef pthread_cond_wait #undef pthread_cond_timedwait +#undef safe_mutex_free_deadlock_data #ifdef HAVE_NONPOSIX_PTHREAD_MUTEX_INIT -#define pthread_mutex_init(a,b) my_pthread_mutex_init((a),(b)) +#define pthread_mutex_init(a,b) my_pthread_noposix_mutex_init((a),(b)) #endif #endif /* DO_NOT_REMOVE_THREAD_WRAPPERS */ +#ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP +pthread_mutexattr_t my_fast_mutexattr; +#endif +#ifdef PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP +pthread_mutexattr_t my_errorcheck_mutexattr; +#endif +#ifdef SAFE_MUTEX_DEFINED static pthread_mutex_t THR_LOCK_mutex; static ulong safe_mutex_count= 0; /* Number of mutexes created */ +static ulong safe_mutex_id= 0; +my_bool safe_mutex_deadlock_detector= 1; /* On by default */ + #ifdef SAFE_MUTEX_DETECT_DESTROY -static struct st_safe_mutex_info_t *safe_mutex_root= NULL; +static struct st_safe_mutex_create_info_t *safe_mutex_create_root= NULL; +#endif + +static my_bool add_used_to_locked_mutex(safe_mutex_t *used_mutex, + safe_mutex_deadlock_t *locked_mutex); +static my_bool add_to_locked_mutex(safe_mutex_deadlock_t *locked_mutex, + safe_mutex_t *current_mutex); +static my_bool remove_from_locked_mutex(safe_mutex_t *mp, + safe_mutex_t *delete_mutex); +static my_bool remove_from_used_mutex(safe_mutex_deadlock_t *locked_mutex, + safe_mutex_t *mutex); +static void print_deadlock_warning(safe_mutex_t *new_mutex, + safe_mutex_t *conflicting_mutex); #endif + +/* Initialize all mutex handling */ + +void my_mutex_init() +{ + /* Initialize mutex attributes */ +#ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP + /* + Set mutex type to "fast" a.k.a "adaptive" + + In this case the thread may steal the mutex from some other thread + that is waiting for the same mutex. This will save us some + context switches but may cause a thread to 'starve forever' while + waiting for the mutex (not likely if the code within the mutex is + short). 
+ */ + pthread_mutexattr_init(&my_fast_mutexattr); + pthread_mutexattr_settype(&my_fast_mutexattr, + PTHREAD_MUTEX_ADAPTIVE_NP); +#endif +#ifdef PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP + /* + Set mutex type to "errorcheck" + */ + pthread_mutexattr_init(&my_errorcheck_mutexattr); + pthread_mutexattr_settype(&my_errorcheck_mutexattr, + PTHREAD_MUTEX_ERRORCHECK); +#endif + +#if defined(SAFE_MUTEX_DEFINED) + safe_mutex_global_init(); +#elif defined(MY_PTHREAD_FASTMUTEX) + fastmutex_global_init(); +#endif +} + +void my_mutex_end() +{ +#ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP + pthread_mutexattr_destroy(&my_fast_mutexattr); +#endif +#ifdef PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP + pthread_mutexattr_destroy(&my_errorcheck_mutexattr); +#endif +} + + +/* Initialize safe_mutex handling */ + +#ifdef SAFE_MUTEX_DEFINED void safe_mutex_global_init(void) { pthread_mutex_init(&THR_LOCK_mutex,MY_MUTEX_INIT_FAST); + safe_mutex_id= safe_mutex_count= 0; + safe_mutex_deadlock_detector= 1; + +#ifdef SAFE_MUTEX_DETECT_DESTROY + safe_mutex_create_root= 0; +#endif +} + +static inline void remove_from_active_list(safe_mutex_t *mp) +{ + if (!(mp->active_flags & (MYF_NO_DEADLOCK_DETECTION | MYF_TRY_LOCK))) + { + /* Remove mutex from active mutex linked list */ + if (mp->next) + mp->next->prev= mp->prev; + if (mp->prev) + mp->prev->next= mp->next; + else + *my_thread_var_mutex_in_use()= mp->next; + } + mp->prev= mp->next= 0; } int safe_mutex_init(safe_mutex_t *mp, const pthread_mutexattr_t *attr __attribute__((unused)), + const char *name, + myf my_flags, const char *file, uint line) { + DBUG_ENTER("safe_mutex_init"); + DBUG_PRINT("enter",("mutex: 0x%lx name: %s", (ulong) mp, name)); bzero((char*) mp,sizeof(*mp)); pthread_mutex_init(&mp->global,MY_MUTEX_INIT_ERRCHK); pthread_mutex_init(&mp->mutex,attr); /* Mark that mutex is initialized */ mp->file= file; mp->line= line; + /* Skip the very common '&' prefix from the autogenerated name */ + mp->name= name[0] == '&' ? 
name + 1 : name; + + if (safe_mutex_deadlock_detector && !( my_flags & MYF_NO_DEADLOCK_DETECTION)) + { + if (!my_multi_malloc(MY_FAE | MY_WME, + &mp->locked_mutex, sizeof(*mp->locked_mutex), + &mp->used_mutex, sizeof(*mp->used_mutex), NullS)) + { + /* Disable deadlock handling for this mutex */ + my_flags|= MYF_NO_DEADLOCK_DETECTION; + } + else + { + pthread_mutex_lock(&THR_LOCK_mutex); + mp->id= ++safe_mutex_id; + pthread_mutex_unlock(&THR_LOCK_mutex); + hash_init(mp->locked_mutex, &my_charset_bin, + 1000, + offsetof(safe_mutex_deadlock_t, id), + sizeof(mp->id), + 0, 0, HASH_UNIQUE); + hash_init(mp->used_mutex, &my_charset_bin, + 1000, + offsetof(safe_mutex_t, id), + sizeof(mp->id), + 0, 0, HASH_UNIQUE); + } + } + else + my_flags|= MYF_NO_DEADLOCK_DETECTION; + mp->create_flags= my_flags; #ifdef SAFE_MUTEX_DETECT_DESTROY /* @@ -70,7 +205,7 @@ int safe_mutex_init(safe_mutex_t *mp, */ if ((mp->info= (safe_mutex_info_t *) malloc(sizeof(safe_mutex_info_t)))) { - struct st_safe_mutex_info_t *info =mp->info; + struct st_safe_mutex_info_t *info= mp->info; info->init_file= file; info->init_line= line; @@ -78,22 +213,25 @@ int safe_mutex_init(safe_mutex_t *mp, info->next= NULL; pthread_mutex_lock(&THR_LOCK_mutex); - if ((info->next= safe_mutex_root)) - safe_mutex_root->prev= info; - safe_mutex_root= info; + if ((info->next= safe_mutex_create_root)) + safe_mutex_create_root->prev= info; + safe_mutex_create_root= info; safe_mutex_count++; pthread_mutex_unlock(&THR_LOCK_mutex); } #else thread_safe_increment(safe_mutex_count, &THR_LOCK_mutex); #endif /* SAFE_MUTEX_DETECT_DESTROY */ - return 0; + DBUG_RETURN(0); } -int safe_mutex_lock(safe_mutex_t *mp, my_bool try_lock, const char *file, uint line) +int safe_mutex_lock(safe_mutex_t *mp, myf my_flags, const char *file, + uint line) { int error; + DBUG_PRINT("mutex", ("%s (0x%lx) locking", mp->name ? mp->name : "Null", + (ulong) mp)); if (!mp->file) { fprintf(stderr, @@ -106,12 +244,13 @@ int safe_mutex_lock(safe_mutex_t *mp, my_bool try_lock, const char *file, uint l pthread_mutex_lock(&mp->global); if (mp->count > 0) { - if (try_lock) - { - pthread_mutex_unlock(&mp->global); - return EBUSY; - } - else if (pthread_equal(pthread_self(),mp->thread)) + /* + Check that we are not trying to lock mutex twice. This is an error + even if we are using 'try_lock' as it's not portably what happens + if you lock the mutex many times and this is in any case bad + behaviour that should not be encouraged + */ + if (pthread_equal(pthread_self(),mp->thread)) { fprintf(stderr, "safe_mutex: Trying to lock mutex at %s, line %d, when the" @@ -139,7 +278,7 @@ int safe_mutex_lock(safe_mutex_t *mp, my_bool try_lock, const char *file, uint l instead just return EBUSY, since this is the expected behaviour of trylock(). 
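
    For example (a sketch; LOCK_cache is a made-up name), a caller
    probing for contention handles EBUSY itself:

      if (pthread_mutex_trylock(&LOCK_cache))   // EBUSY while contended
        return;                                 // skip the optional work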
  */
-  if (try_lock)
+  if (my_flags & MYF_TRY_LOCK)
  {
    error= pthread_mutex_trylock(&mp->mutex);
    if (error == EBUSY)
@@ -150,22 +289,109 @@
  if (error || (error=pthread_mutex_lock(&mp->global)))
  {
-    fprintf(stderr,"Got error %d when trying to lock mutex at %s, line %d\n",
-            error, file, line);
+    fprintf(stderr,"Got error %d when trying to lock mutex %s at %s, line %d\n",
+            error, mp->name, file, line);
    fflush(stderr);
    abort();
  }
  mp->thread= pthread_self();
  if (mp->count++)
  {
-    fprintf(stderr,"safe_mutex: Error in thread libray: Got mutex at %s, \
-line %d more than 1 time\n", file,line);
+    fprintf(stderr,"safe_mutex: Error in thread library: Got mutex %s at %s, "
+            "line %d more than 1 time\n", mp->name, file,line);
    fflush(stderr);
    abort();
  }
  mp->file= file;
-  mp->line=line;
+  mp->line= line;
+  mp->active_flags= mp->create_flags | my_flags;
  pthread_mutex_unlock(&mp->global);
+
+  /* Deadlock detection */
+
+  mp->prev= mp->next= 0;
+  if (!(mp->active_flags & (MYF_TRY_LOCK | MYF_NO_DEADLOCK_DETECTION)))
+  {
+    safe_mutex_t **mutex_in_use= my_thread_var_mutex_in_use();
+
+    if (!mutex_in_use)
+    {
+      /* thread has not called my_thread_init() */
+      mp->active_flags|= MYF_NO_DEADLOCK_DETECTION;
+    }
+    else
+    {
+      safe_mutex_t *mutex_root;
+      if ((mutex_root= *mutex_in_use))   /* If not first locked */
+      {
+        /*
+          Protect locked_mutex against changes if a mutex is deleted
+        */
+        pthread_mutex_lock(&THR_LOCK_mutex);
+
+        if (!hash_search(mutex_root->locked_mutex, (uchar*) &mp->id, 0))
+        {
+          safe_mutex_deadlock_t *deadlock;
+          safe_mutex_t *mutex;
+
+          /* Create object to store mutex info */
+          if (!(deadlock= my_malloc(sizeof(*deadlock),
+                                    MYF(MY_ZEROFILL | MY_WME | MY_FAE))))
+            goto abort_loop;
+          deadlock->name= mp->name;
+          deadlock->id= mp->id;
+          deadlock->mutex= mp;
+          /* The following is useful for debugging wrong mutex usage */
+          deadlock->file= file;
+          deadlock->line= line;
+
+          /* Check if potential deadlock */
+          mutex= mutex_root;
+          do
+          {
+            if (hash_search(mp->locked_mutex, (uchar*) &mutex->id, 0))
+            {
+              print_deadlock_warning(mp, mutex);
+              /* Mark wrong usage to avoid future warnings for the same error */
+              deadlock->warning_only= 1;
+              add_to_locked_mutex(deadlock, mutex_root);
+              DBUG_ASSERT(deadlock->count > 0);
+              goto abort_loop;
+            }
+          }
+          while ((mutex= mutex->next));
+
+          /*
+            Copy the current mutex and all mutexes that have been locked
+            after it (mp->locked_mutex) to all mutexes that were locked
+            before the previous mutex (mutex_root->used_mutex).
+
+            For example, if A->B has been locked before and we are now
+            locking C in B->C, then we add C into both B->locked_mutex
+            and A->locked_mutex.
+          */
+          my_hash_iterate(mutex_root->used_mutex,
+                          (my_hash_walk_action) add_used_to_locked_mutex,
+                          deadlock);
+
+          /*
+            Copy the current mutex and all mutexes locked after it
+            into the previous mutex
+          */
+          add_used_to_locked_mutex(mutex_root, deadlock);
+          DBUG_ASSERT(deadlock->count > 0);
+        }
+  abort_loop:
+        pthread_mutex_unlock(&THR_LOCK_mutex);
+      }
+      /* Link mutex into mutex_in_use list */
+      if ((mp->next= *mutex_in_use))
+        (*mutex_in_use)->prev= mp;
+      *mutex_in_use= mp;
+    }
+  }
+
+  DBUG_PRINT("mutex", ("%s (0x%lx) locked", mp->name, (ulong) mp));
  return error;
}
@@ -173,23 +399,34 @@ line %d more than 1 time\n", file,line);
int safe_mutex_unlock(safe_mutex_t *mp,const char *file, uint line)
{
  int error;
+  DBUG_PRINT("mutex", ("%s (0x%lx) unlocking", mp->name, (ulong) mp));
  pthread_mutex_lock(&mp->global);
  if (mp->count == 0)
  {
-    fprintf(stderr,"safe_mutex: Trying to unlock mutex that wasn't locked at %s, line %d\n Last used at %s, line: %d\n",
-            file,line,mp->file ? mp->file : "",mp->line);
+    fprintf(stderr,
+            "safe_mutex: Trying to unlock mutex %s that wasn't locked at "
+            "%s, line %d\n"
+            "Last used at %s, line: %d\n",
+            mp->name ? mp->name : "Null", file, line,
+            mp->file ? mp->file : "Null", mp->line);
    fflush(stderr);
    abort();
  }
  if (!pthread_equal(pthread_self(),mp->thread))
  {
-    fprintf(stderr,"safe_mutex: Trying to unlock mutex at %s, line %d that was locked by another thread at: %s, line: %d\n",
-            file,line,mp->file,mp->line);
+    fprintf(stderr,
+            "safe_mutex: Trying to unlock mutex %s at %s, line %d that was "
+            "locked by another thread at: %s, line: %d\n",
+            mp->name, file, line, mp->file, mp->line);
    fflush(stderr);
    abort();
  }
  mp->thread= 0;
  mp->count--;
+
+  remove_from_active_list(mp);
+
#ifdef __WIN__
  pthread_mutex_unlock(&mp->mutex);
  error=0;
@@ -197,7 +434,9 @@ int safe_mutex_unlock(safe_mutex_t *mp,const char *file, uint line)
  error=pthread_mutex_unlock(&mp->mutex);
  if (error)
  {
-    fprintf(stderr,"safe_mutex: Got error: %d (%d) when trying to unlock mutex at %s, line %d\n", error, errno, file, line);
+    fprintf(stderr,
+            "safe_mutex: Got error: %d (%d) when trying to unlock mutex "
+            "%s at %s, line %d\n", error, errno, mp->name, file, line);
    fflush(stderr);
    abort();
  }
@@ -211,43 +450,62 @@ int safe_mutex_unlock(safe_mutex_t *mp,const char *file, uint line)
int safe_cond_wait(pthread_cond_t *cond, safe_mutex_t *mp, const char *file,
                   uint line)
{
  int error;
+  safe_mutex_t save_state;
+
  pthread_mutex_lock(&mp->global);
  if (mp->count == 0)
  {
-    fprintf(stderr,"safe_mutex: Trying to cond_wait on a unlocked mutex at %s, line %d\n",file,line);
+    fprintf(stderr,
+            "safe_mutex: Trying to cond_wait on an unlocked mutex %s at %s, "
+            "line %d\n",
+            mp->name ? mp->name : "Null", file, line);
    fflush(stderr);
    abort();
  }
  if (!pthread_equal(pthread_self(),mp->thread))
  {
-    fprintf(stderr,"safe_mutex: Trying to cond_wait on a mutex at %s, line %d that was locked by another thread at: %s, line: %d\n",
-            file,line,mp->file,mp->line);
+    fprintf(stderr,
+            "safe_mutex: Trying to cond_wait on a mutex %s at %s, line %d "
+            "that was locked by another thread at: %s, line: %d\n",
+            mp->name, file, line, mp->file, mp->line);
    fflush(stderr);
    abort();
  }
  if (mp->count-- != 1)
  {
-    fprintf(stderr,"safe_mutex: Count was %d on locked mutex at %s, line %d\n",
-            mp->count+1, file, line);
+    fprintf(stderr,
+            "safe_mutex: Count was %d on locked mutex %s at %s, line %d\n",
+            mp->count+1, mp->name, file, line);
    fflush(stderr);
    abort();
  }
+  save_state= *mp;
+  remove_from_active_list(mp);
  pthread_mutex_unlock(&mp->global);
  error=pthread_cond_wait(cond,&mp->mutex);
  pthread_mutex_lock(&mp->global);
+
  if (error)
  {
-    fprintf(stderr,"safe_mutex: Got error: %d (%d) when doing a safe_mutex_wait at %s, line %d\n", error, errno, file, line);
+    fprintf(stderr,
+            "safe_mutex: Got error: %d (%d) when doing a safe_mutex_wait on "
+            "%s at %s, line %d\n", error, errno, mp->name, file, line);
    fflush(stderr);
    abort();
  }
-  mp->thread=pthread_self();
+  /* Restore state as it was before the wait */
+  mp->thread= save_state.thread;
+  mp->active_flags= save_state.active_flags;
+  mp->next= save_state.next;
+  mp->prev= save_state.prev;
+
  if (mp->count++)
  {
    fprintf(stderr,
-            "safe_mutex: Count was %d in thread 0x%lx when locking mutex at %s, line %d\n",
-            mp->count-1, my_thread_dbug_id(), file, line);
+            "safe_mutex: Count was %d in thread 0x%lx when locking mutex %s "
+            "at %s, line %d\n",
+            mp->count-1, my_thread_dbug_id(), mp->name, file, line);
    fflush(stderr);
    abort();
  }
@@ -263,29 +521,46 @@ int safe_cond_timedwait(pthread_cond_t *cond, safe_mutex_t *mp,
                        const char *file, uint line)
{
  int error;
+  safe_mutex_t save_state;
+
  pthread_mutex_lock(&mp->global);
  if (mp->count != 1 || !pthread_equal(pthread_self(),mp->thread))
  {
-    fprintf(stderr,"safe_mutex: Trying to cond_wait at %s, line %d on a not hold mutex\n",file,line);
+    fprintf(stderr,
+            "safe_mutex: Trying to cond_wait at %s, line %d on a mutex %s "
+            "that is not held\n",
+            file, line, mp->name ?
mp->name : "Null"); fflush(stderr); abort(); } mp->count--; /* Mutex will be released */ + save_state= *mp; + remove_from_active_list(mp); pthread_mutex_unlock(&mp->global); error=pthread_cond_timedwait(cond,&mp->mutex,abstime); #ifdef EXTRA_DEBUG if (error && (error != EINTR && error != ETIMEDOUT && error != ETIME)) { - fprintf(stderr,"safe_mutex: Got error: %d (%d) when doing a safe_mutex_timedwait at %s, line %d\n", error, errno, file, line); + fprintf(stderr, + "safe_mutex: Got error: %d (%d) when doing a safe_mutex_timedwait " + "on %s at %s, line %d\n", + error, errno, mp->name, file, line); } #endif pthread_mutex_lock(&mp->global); - mp->thread=pthread_self(); + /* Restore state as it was before */ + mp->thread= save_state.thread; + mp->active_flags= save_state.active_flags; + mp->next= save_state.next; + mp->prev= save_state.prev; + if (mp->count++) { fprintf(stderr, - "safe_mutex: Count was %d in thread 0x%lx when locking mutex at %s, line %d (error: %d (%d))\n", - mp->count-1, my_thread_dbug_id(), file, line, error, error); + "safe_mutex: Count was %d in thread 0x%lx when locking mutex " + "%s at %s, line %d (error: %d (%d))\n", + mp->count-1, my_thread_dbug_id(), mp->name, file, line, + error, error); fflush(stderr); abort(); } @@ -299,6 +574,8 @@ int safe_cond_timedwait(pthread_cond_t *cond, safe_mutex_t *mp, int safe_mutex_destroy(safe_mutex_t *mp, const char *file, uint line) { int error=0; + DBUG_ENTER("safe_mutex_destroy"); + DBUG_PRINT("enter", ("mutex: 0x%lx name: %s", (ulong) mp, mp->name)); if (!mp->file) { fprintf(stderr, @@ -309,11 +586,17 @@ int safe_mutex_destroy(safe_mutex_t *mp, const char *file, uint line) } if (mp->count != 0) { - fprintf(stderr,"safe_mutex: Trying to destroy a mutex that was locked at %s, line %d at %s, line %d\n", - mp->file,mp->line, file, line); + fprintf(stderr, + "safe_mutex: Trying to destroy a mutex %s that was locked at %s, " + "line %d at %s, line %d\n", + mp->name, mp->file, mp->line, file, line); fflush(stderr); abort(); } + + /* Free all entries that points to this one */ + safe_mutex_free_deadlock_data(mp); + #ifdef __WIN__ pthread_mutex_destroy(&mp->global); pthread_mutex_destroy(&mp->mutex); @@ -334,7 +617,7 @@ int safe_mutex_destroy(safe_mutex_t *mp, const char *file, uint line) if (info->prev) info->prev->next = info->next; else - safe_mutex_root = info->next; + safe_mutex_create_root = info->next; if (info->next) info->next->prev = info->prev; safe_mutex_count--; @@ -346,10 +629,38 @@ int safe_mutex_destroy(safe_mutex_t *mp, const char *file, uint line) #else thread_safe_sub(safe_mutex_count, 1, &THR_LOCK_mutex); #endif /* SAFE_MUTEX_DETECT_DESTROY */ - return error; + DBUG_RETURN(error); } +/** + Free all data related to deadlock detection + + This is also useful together with safemalloc when you don't want to + have reports of not freed memory for mysys mutexes. 
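+
+  For example (a sketch; LOCK_static is a made-up name): a mutex that is
+  deliberately never destroyed can still be scrubbed at shutdown with
+
+    safe_mutex_free_deadlock_data(&LOCK_static);
+
+  after which only the mutex object itself remains allocated.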
+*/ + +void safe_mutex_free_deadlock_data(safe_mutex_t *mp) +{ + /* Free all entries that points to this one */ + if (!(mp->create_flags & MYF_NO_DEADLOCK_DETECTION)) + { + pthread_mutex_lock(&THR_LOCK_mutex); + my_hash_iterate(mp->used_mutex, + (my_hash_walk_action) remove_from_locked_mutex, + mp); + my_hash_iterate(mp->locked_mutex, + (my_hash_walk_action) remove_from_used_mutex, + mp); + pthread_mutex_unlock(&THR_LOCK_mutex); + + hash_free(mp->used_mutex); + hash_free(mp->locked_mutex); + my_free(mp->locked_mutex, 0); + mp->create_flags|= MYF_NO_DEADLOCK_DETECTION; + } +} + /* Free global resources and check that all mutex has been destroyed @@ -380,16 +691,145 @@ void safe_mutex_end(FILE *file __attribute__((unused))) } { struct st_safe_mutex_info_t *ptr; - for (ptr= safe_mutex_root ; ptr ; ptr= ptr->next) + for (ptr= safe_mutex_create_root ; ptr ; ptr= ptr->next) { - fprintf(file, "\tMutex initiated at line %4u in '%s'\n", - ptr->init_line, ptr->init_file); + fprintf(file, "\tMutex %s initiated at line %4u in '%s'\n", + ptr->name, ptr->init_line, ptr->init_file); (void) fflush(file); } } #endif /* SAFE_MUTEX_DETECT_DESTROY */ } + +static my_bool add_used_to_locked_mutex(safe_mutex_t *used_mutex, + safe_mutex_deadlock_t *locked_mutex) +{ + /* Add mutex to all parent of the current mutex */ + if (!locked_mutex->warning_only) + { + (void) my_hash_iterate(locked_mutex->mutex->locked_mutex, + (my_hash_walk_action) add_to_locked_mutex, + used_mutex); + /* mark that locked_mutex is locked after used_mutex */ + (void) add_to_locked_mutex(locked_mutex, used_mutex); + } + return 0; +} + + +/** + register that locked_mutex was locked after current_mutex +*/ + +static my_bool add_to_locked_mutex(safe_mutex_deadlock_t *locked_mutex, + safe_mutex_t *current_mutex) +{ + DBUG_ENTER("add_to_locked_mutex"); + DBUG_PRINT("info", ("inserting 0x%lx into 0x%lx (id: %lu -> %lu)", + (ulong) locked_mutex, (long) current_mutex, + locked_mutex->id, current_mutex->id)); + if (my_hash_insert(current_mutex->locked_mutex, (uchar*) locked_mutex)) + { + /* Got mutex through two paths; ignore */ + DBUG_RETURN(0); + } + locked_mutex->count++; + if (my_hash_insert(locked_mutex->mutex->used_mutex, + (uchar*) current_mutex)) + { + DBUG_ASSERT(0); + } + DBUG_RETURN(0); +} + + +/** + Remove mutex from the locked mutex hash + @fn remove_from_used_mutex() + @param mp Mutex that has delete_mutex in it's locked_mutex hash + @param delete_mutex Mutex should be removed from the hash + + @notes + safe_mutex_deadlock_t entries in the locked hash are shared. + When counter goes to 0, we delete the safe_mutex_deadlock_t entry. 
+*/ + +static my_bool remove_from_locked_mutex(safe_mutex_t *mp, + safe_mutex_t *delete_mutex) +{ + safe_mutex_deadlock_t *found; + DBUG_ENTER("remove_from_locked_mutex"); + DBUG_PRINT("enter", ("delete_mutex: 0x%lx mutex: 0x%lx (id: %lu <- %lu)", + (ulong) delete_mutex, (ulong) mp, + delete_mutex->id, mp->id)); + + found= (safe_mutex_deadlock_t *) hash_search(mp->locked_mutex, + (uchar*) &delete_mutex->id, 0); + DBUG_ASSERT(found); + if (found) + { + if (hash_delete(mp->locked_mutex, (uchar*) found)) + { + DBUG_ASSERT(0); + } + if (!--found->count) + my_free(found, MYF(0)); + } + DBUG_RETURN(0); +} + +static my_bool remove_from_used_mutex(safe_mutex_deadlock_t *locked_mutex, + safe_mutex_t *mutex) +{ + DBUG_ENTER("remove_from_used_mutex"); + DBUG_PRINT("enter", ("delete_mutex: 0x%lx mutex: 0x%lx (id: %lu <- %lu)", + (ulong) mutex, (ulong) locked_mutex, + mutex->id, locked_mutex->id)); + if (hash_delete(locked_mutex->mutex->used_mutex, (uchar*) mutex)) + { + DBUG_ASSERT(0); + } + if (!--locked_mutex->count) + my_free(locked_mutex, MYF(0)); + DBUG_RETURN(0); +} + + +static void print_deadlock_warning(safe_mutex_t *new_mutex, + safe_mutex_t *parent_mutex) +{ + safe_mutex_t *mutex_root; + DBUG_ENTER("print_deadlock_warning"); + DBUG_PRINT("enter", ("mutex: %s parent: %s", + new_mutex->name, parent_mutex->name)); + + fprintf(stderr, "safe_mutex: Found wrong usage of mutex " + "'%s' and '%s'\n", + parent_mutex->name, new_mutex->name); + DBUG_PRINT("info", ("safe_mutex: Found wrong usage of mutex " + "'%s' and '%s'", + parent_mutex->name, new_mutex->name)); + fprintf(stderr, "Mutex currently locked (in reverse order):\n"); + DBUG_PRINT("info", ("Mutex currently locked (in reverse order):")); + fprintf(stderr, "%-32.32s %s line %u\n", new_mutex->name, new_mutex->file, + new_mutex->line); + DBUG_PRINT("info", ("%-32.32s %s line %u\n", new_mutex->name, + new_mutex->file, new_mutex->line)); + for (mutex_root= *my_thread_var_mutex_in_use() ; + mutex_root; + mutex_root= mutex_root->next) + { + fprintf(stderr, "%-32.32s %s line %u\n", mutex_root->name, + mutex_root->file, mutex_root->line); + DBUG_PRINT("info", ("%-32.32s %s line %u", mutex_root->name, + mutex_root->file, mutex_root->line)); + } + fflush(stderr); + DBUG_VOID_RETURN; +} + + #endif /* THREAD && SAFE_MUTEX */ #if defined(THREAD) && defined(MY_PTHREAD_FASTMUTEX) && !defined(SAFE_MUTEX) @@ -495,4 +935,5 @@ void fastmutex_global_init(void) #endif } -#endif /* defined(THREAD) && defined(MY_PTHREAD_FASTMUTEX) && !defined(SAFE_MUTEX) */ +#endif /* SAFE_MUTEX_DEFINED */ +#endif /* THREAD */ diff --git a/mysys/thr_rwlock.c b/mysys/thr_rwlock.c index 0aa4d3fc3c4..280a0ec19e7 100644 --- a/mysys/thr_rwlock.c +++ b/mysys/thr_rwlock.c @@ -89,7 +89,7 @@ int my_rw_rdlock(rw_lock_t *rwp) pthread_mutex_lock(&rwp->lock); /* active or queued writers */ - while (( rwp->state < 0 ) || rwp->waiters) + while ((rwp->state < 0 ) || rwp->waiters) pthread_cond_wait( &rwp->readers, &rwp->lock); rwp->state++; diff --git a/mysys/typelib.c b/mysys/typelib.c index e745a9fb917..ff5dc1231e4 100644 --- a/mysys/typelib.c +++ b/mysys/typelib.c @@ -22,7 +22,7 @@ static const char field_separator=','; -int find_type_or_exit(const char *x, TYPELIB *typelib, const char *option) +int find_type_with_warning(const char *x, TYPELIB *typelib, const char *option) { int res; const char **ptr; @@ -38,12 +38,20 @@ int find_type_or_exit(const char *x, TYPELIB *typelib, const char *option) while (*++ptr) fprintf(stderr, ",'%s'", *ptr); fprintf(stderr, "\n"); - exit(1); } return res; } 
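+/*
+  A sketch of how the warning-only lookup above and the exiting variant
+  added below fit together (illustrative only; the TYPELIB and the
+  DEFAULT_MODE fallback are made up):
+
+    static const char *mode_names[]= {"none", "fast", "safe", NullS};
+    static TYPELIB mode_typelib= {array_elements(mode_names) - 1, "",
+                                  mode_names, NULL};
+
+    uint mode= find_type_or_exit("fast", &mode_typelib, "--mode");
+                                        // prints the valid values and
+                                        // exits on an unknown value
+    int res= find_type_with_warning("bogus", &mode_typelib, "--mode");
+    if (res <= 0)                       // warn-only variant lets the
+      mode= DEFAULT_MODE;               // caller fall back to a default
+*/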
+uint find_type_or_exit(const char *x, TYPELIB *typelib, const char *option)
+{
+  int res;
+  if ((res= find_type_with_warning(x, typelib, option)) <= 0)
+    exit(1);
+  return (uint) res;
+}
+
+
 /*
   Search after a string in a list of strings. Endspace in x is not compared.
diff --git a/mysys/waiting_threads.c b/mysys/waiting_threads.c
new file mode 100644
index 00000000000..732929f6d99
--- /dev/null
+++ b/mysys/waiting_threads.c
@@ -0,0 +1,1153 @@
+/* Copyright (C) 2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/**
+  @file
+
+  "waiting threads" subsystem - a unified interface for threads to wait
+  on each other, with built-in deadlock detection.
+
+  Main concepts
+  ^^^^^^^^^^^^^
+  a thread - is represented by a WT_THD structure. One physical thread
+  can have only one WT_THD descriptor at any given moment.
+
+  a resource - a thread does not wait for other threads directly,
+  instead it waits for a "resource", which is "owned" by other threads.
+  More precisely, it waits for all "owners" to "release" the resource.
+  It does not have to correspond to a physical resource. For example, it
+  may be convenient in certain cases to force resource == thread.
+  A resource is represented by a WT_RESOURCE structure.
+
+  a resource identifier - a pair of {resource type, value}. A value is
+  an ulonglong number. Represented by a WT_RESOURCE_ID structure.
+
+  a resource type - a pointer to a statically defined instance of
+  WT_RESOURCE_TYPE structure. This structure contains a pointer to
+  a function that knows how to compare values of this resource type.
+  In the simple case it could be wt_resource_id_memcmp().
+
+  a wait-for graph - a graph that represents "wait-for" relationships.
+  It has two types of nodes - threads and resources. There are directed
+  edges from a thread to a resource it is waiting for (WT_THD::waiting_for),
+  from a thread to resources that it "owns" (WT_THD::my_resources),
+  and from a resource to threads that "own" it (WT_RESOURCE::owners).
+
+  Graph completeness
+  ^^^^^^^^^^^^^^^^^^
+
+  For flawless deadlock detection the wait-for graph must be complete.
+  It means that when a thread starts waiting it needs to know *all* its
+  blockers, and call wt_thd_will_wait_for() for every one of them.
+  Otherwise two phenomena should be expected:
+
+  1. Fuzzy timeouts:
+
+    thread A needs to get a lock, and is blocked by a thread B.
+    It waits.
+    Just before the timeout thread B releases the lock.
+    thread A is ready to grab the lock but discovers that it is also
+    blocked by a thread C.
+    It waits and times out.
+
+    As a result thread A has waited two timeout intervals, instead of one.
+
+  2. Unreliable cycle detection:
+
+    Thread A waits for threads B and C
+    Thread C waits for D
+    Thread D wants to start waiting for A
+
+    One can see immediately that thread D creates a cycle, and thus
+    a deadlock is detected.
+
+    But if thread A was only waiting for B, and started waiting for C
+    only when B unlocked, thread D would be allowed to wait, and the
+    deadlock would only be detected when B unlocks or somebody times out.
+
+  These two phenomena don't affect correctness, and strictly speaking,
+  the caller is not required to call wt_thd_will_wait_for() for *all*
+  blockers - it may optimize wt_thd_will_wait_for() calls. But they
+  may be perceived as bugs by users, so it must be understood that such
+  an optimization comes with a price.
+
+  Usage
+  ^^^^^
+
+  First, the wt* subsystem must be initialized by calling
+  wt_init(). In the server you don't need to do it, it's done
+  in mysqld.cc.
+
+  Similarly, wt_end() frees the wt* structures; it should be called
+  at the end, but in the server mysqld.cc takes care of that.
+
+  Every WT_THD should be initialized with wt_thd_lazy_init().
+  After that it can be used in other wt_thd_* calls.
+  Before discarding, a WT_THD should be freed with
+  wt_thd_destroy(). In the server both are handled in sql_class.cc,
+  it's an error to try to do it manually.
+
+  To use the deadlock detection one needs to use this thread's WT_THD,
+  call wt_thd_will_wait_for() for every thread it needs to wait on,
+  then call wt_thd_cond_timedwait(). When a thread releases a resource
+  it should call wt_thd_release() (or wt_thd_release_all()) - it will
+  notify (send a signal to) threads waiting in wt_thd_cond_timedwait(),
+  if appropriate. (A compressed sketch of this call sequence follows at
+  the end of this comment.)
+
+  Just like with pthread's cond_wait, there could be spurious
+  wake-ups from wt_thd_cond_timedwait(). A caller is expected to
+  handle that (that is, to re-check the blocking criteria).
+
+  wt_thd_will_wait_for() and wt_thd_cond_timedwait() return either
+  WT_OK or WT_DEADLOCK. Additionally wt_thd_cond_timedwait() can return
+  WT_TIMEOUT. Out of memory and other fatal errors are reported as
+  WT_DEADLOCK - and a transaction must be aborted just the same.
+
+  Configuration
+  ^^^^^^^^^^^^^
+  There are four config variables. Two deadlock search depths - short and
+  long - and two timeouts. Deadlock search is performed with the short
+  depth on every wt_thd_will_wait_for() call. wt_thd_cond_timedwait()
+  waits with a short timeout, performs a deadlock search with the long
+  depth, and waits with a long timeout. As most deadlock cycles are supposed
+  to be short, most deadlocks will be detected at once, and waits will
+  rarely be necessary.
+
+  These config variables are thread-local. Different threads may have
+  different search depth and timeout values.
+
+  Also, the deadlock detector supports different killing strategies; the
+  victim in a deadlock cycle is selected based on the "weight". See the
+  "weight" description in waiting_threads.h for details. It's up to the
+  caller to set weights accordingly.
+
+  Status
+  ^^^^^^
+  We calculate the number of successful waits (WT_OK returned from
+  wt_thd_cond_timedwait()), the number of timeouts, a deadlock cycle
+  length distribution - the number of deadlocks with every length from
+  1 to WT_CYCLE_STATS, and a wait time distribution - the number
+  of waits with a time from 1 us to 1 min in WT_WAIT_STATS
+  intervals on a log e scale.
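+
+  A compressed sketch of the waiter side described above (illustrative
+  pseudo-C; discovery of blockers, the retry policy, and names such as
+  my_wt_thd, my_mutex, resource_id are the caller's code, and the exact
+  prototypes are in waiting_threads.h):
+
+    for (each thread T currently blocking us)
+      if (wt_thd_will_wait_for(my_wt_thd, T, &resource_id) == WT_DEADLOCK)
+        abort_transaction();               // cycle found already
+    switch (wt_thd_cond_timedwait(my_wt_thd, &my_mutex))
+    {
+    case WT_OK:       break;               // re-check blocking criteria
+    case WT_TIMEOUT:  give_up_wait();      // lock wait timeout
+    case WT_DEADLOCK: abort_transaction(); // we were picked as the victim
+    }
+    ...
+    wt_thd_release_all(my_wt_thd);         // when we release our resources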
+*/
+
+/*
+  Note that if your lock system satisfies the following condition:
+
+    there exist four lock levels A, B, C, D, such that
+      A is compatible with B
+      A is not compatible with C
+      D is not compatible with B
+
+      (example: A=IX, B=IS, C=S, D=X)
+
+  then you need to include the lock level in the resource identifier - a
+  thread waiting for a lock of the type A on resource R and another
+  thread waiting for a lock of the type B on resource R should wait on
+  different WT_RESOURCE structures, on different {lock, resource}
+  pairs. Otherwise the following is possible:
+
+    thread1> take S-lock on R
+    thread2> take IS-lock on R
+    thread3> wants X-lock on R, starts waiting for threads 1 and 2 on R.
+    thread3 is killed (or times out, or whatever)
+    WT_RESOURCE structure for R is still in the hash, as it has two owners
+    thread4> wants an IX-lock on R
+    WT_RESOURCE for R is found in the hash, thread4 starts waiting on it.
+    !! now thread4 is waiting for both thread1 and thread2
+    !! while, in fact, IX-lock and IS-lock are compatible and
+    !! thread4 should not wait for thread2.
+*/
+
+#include <waiting_threads.h>
+#include <m_string.h>
+
+/* status variables */
+
+/**
+  preset table of wait intervals
+*/
+ulonglong wt_wait_table[WT_WAIT_STATS];
+/**
+  wait time distribution (log e scale)
+*/
+uint32 wt_wait_stats[WT_WAIT_STATS+1];
+/**
+  distribution of cycle lengths
+  first column tells whether this was during short or long detection
+*/
+uint32 wt_cycle_stats[2][WT_CYCLE_STATS+1];
+uint32 wt_success_stats;
+
+static my_atomic_rwlock_t cycle_stats_lock, wait_stats_lock, success_stats_lock;
+
+#ifdef SAFE_STATISTICS
+#define incr(VAR, LOCK)                 \
+  do {                                  \
+    my_atomic_rwlock_wrlock(&(LOCK));   \
+    my_atomic_add32(&(VAR), 1);         \
+    my_atomic_rwlock_wrunlock(&(LOCK)); \
+  } while(0)
+#else
+#define incr(VAR,LOCK)  do { (VAR)++; } while(0)
+#endif
+
+static void increment_success_stats()
+{
+  incr(wt_success_stats, success_stats_lock);
+}
+
+static void increment_cycle_stats(uint depth, uint slot)
+{
+  if (depth >= WT_CYCLE_STATS)
+    depth= WT_CYCLE_STATS;
+  incr(wt_cycle_stats[slot][depth], cycle_stats_lock);
+}
+
+static void increment_wait_stats(ulonglong waited,int ret)
+{
+  uint i;
+  if ((ret) == ETIMEDOUT)
+    i= WT_WAIT_STATS;
+  else
+    for (i= 0; i < WT_WAIT_STATS && waited/10 > wt_wait_table[i]; i++) ;
+  incr(wt_wait_stats[i], wait_stats_lock);
+}
+
+/*
+  'lock' protects 'owners', 'state', and 'waiter_count';
+  'id' is read-only
+
+  a resource is picked up from a hash in a lock-free manner;
+  it's returned pinned, so it cannot be freed at once,
+  but it may be freed right after the pin is removed.
+  To free a resource it should
+    1. have no owners
+    2. have no waiters
+
+  two ways to access a resource:
+    1. find it in a hash
+       - it's returned pinned.
+         a) take a lock in exclusive mode
+         b) check the state, it should be ACTIVE to be usable
+         c) unpin
+    2. by a direct reference
+       - can only be used if the resource cannot be freed;
+         e.g. accessing a resource via thd->waiting_for is safe,
+         as the resource cannot be freed while a thread is waiting for it
+*/
+struct st_wt_resource {
+  WT_RESOURCE_ID  id;
+  uint            waiter_count;
+  enum { ACTIVE, FREE } state;
+#ifndef DBUG_OFF
+  pthread_mutex_t  *cond_mutex; /* a mutex for the 'cond' below */
+#endif
+  /*
+    before the 'lock' all elements are mutable, after (and including) -
+    immutable in the sense that lf_hash_insert() won't memcpy() over them.
+    See wt_init().
+  */
+#ifdef WT_RWLOCKS_USE_MUTEXES
+  /*
+    we need a special rwlock-like 'lock' to allow readers to bypass
+    waiting writers, otherwise readers can deadlock. For example:
+
+    A waits on resource x, owned by B, B waits on resource y, owned
+    by A, we have a cycle (A->x->B->y->A)
+    Both A and B start deadlock detection:
+
+      A locks x        B locks y
+      A goes deeper    B goes deeper
+      A locks y        B locks x
+
+    with mutexes it would deadlock. With rwlocks it won't, as long
+    as both A and B are taking read locks (and they do).
+    But other threads may take write locks. Assume there's
+    C who wants to start waiting on x, and D who wants to start
+    waiting on y.
+
+      A read-locks x                          B read-locks y
+      A goes deeper                           B goes deeper
+   => C write-locks x (to add a new edge)     D write-locks y
+   .. C is blocked                            D is blocked
+      A read-locks y                          B read-locks x
+
+    Now, if a read lock can bypass a pending write lock request, we're
+    fine. If it cannot, we have a deadlock.
+
+    Writer starvation is technically possible, but unlikely, because
+    the contention is expected to be low.
+  */
+  struct {
+    pthread_cond_t   cond;
+    pthread_mutex_t  mutex;
+    uint readers: 16;
+    uint pending_writers: 15;
+    uint write_locked: 1;
+  } lock;
+#else
+  rw_lock_t lock;
+#endif
+  pthread_cond_t   cond;   /* the corresponding mutex is provided by the caller */
+  DYNAMIC_ARRAY    owners;
+};
+
+#ifdef WT_RWLOCKS_USE_MUTEXES
+static void rc_rwlock_init(WT_RESOURCE *rc)
+{
+  pthread_cond_init(&rc->lock.cond, 0);
+  pthread_mutex_init(&rc->lock.mutex, MY_MUTEX_INIT_FAST);
+}
+static void rc_rwlock_destroy(WT_RESOURCE *rc)
+{
+  DBUG_ASSERT(rc->lock.write_locked == 0);
+  DBUG_ASSERT(rc->lock.readers == 0);
+  pthread_cond_destroy(&rc->lock.cond);
+  pthread_mutex_destroy(&rc->lock.mutex);
+}
+static void rc_rdlock(WT_RESOURCE *rc)
+{
+  DBUG_PRINT("wt", ("TRYLOCK resid=%ld for READ", (ulong)rc->id.value));
+  pthread_mutex_lock(&rc->lock.mutex);
+  while (rc->lock.write_locked)
+    pthread_cond_wait(&rc->lock.cond, &rc->lock.mutex);
+  rc->lock.readers++;
+  pthread_mutex_unlock(&rc->lock.mutex);
+  DBUG_PRINT("wt", ("LOCK resid=%ld for READ", (ulong)rc->id.value));
+}
+static void rc_wrlock(WT_RESOURCE *rc)
+{
+  DBUG_PRINT("wt", ("TRYLOCK resid=%ld for WRITE", (ulong)rc->id.value));
+  pthread_mutex_lock(&rc->lock.mutex);
+  while (rc->lock.write_locked || rc->lock.readers)
+    pthread_cond_wait(&rc->lock.cond, &rc->lock.mutex);
+  rc->lock.write_locked= 1;
+  pthread_mutex_unlock(&rc->lock.mutex);
+  DBUG_PRINT("wt", ("LOCK resid=%ld for WRITE", (ulong)rc->id.value));
+}
+static void rc_unlock(WT_RESOURCE *rc)
+{
+  DBUG_PRINT("wt", ("UNLOCK resid=%ld", (ulong)rc->id.value));
+  pthread_mutex_lock(&rc->lock.mutex);
+  if (rc->lock.write_locked)
+  {
+    rc->lock.write_locked= 0;
+    pthread_cond_broadcast(&rc->lock.cond);
+  }
+  else if (--rc->lock.readers == 0)
+    pthread_cond_broadcast(&rc->lock.cond);
+  pthread_mutex_unlock(&rc->lock.mutex);
+}
+#else
+static void rc_rwlock_init(WT_RESOURCE *rc)
+{
+  my_rwlock_init(&rc->lock, 0);
+}
+static void rc_rwlock_destroy(WT_RESOURCE *rc)
+{
+  rwlock_destroy(&rc->lock);
+}
+static void rc_rdlock(WT_RESOURCE *rc)
+{
+  DBUG_PRINT("wt", ("TRYLOCK resid=%ld for READ", (ulong)rc->id.value));
+  rw_rdlock(&rc->lock);
+  DBUG_PRINT("wt", ("LOCK resid=%ld for READ", (ulong)rc->id.value));
+}
+static void rc_wrlock(WT_RESOURCE *rc)
+{
+  DBUG_PRINT("wt", ("TRYLOCK resid=%ld for WRITE", (ulong)rc->id.value));
+  rw_wrlock(&rc->lock);
+  DBUG_PRINT("wt", ("LOCK resid=%ld for WRITE", (ulong)rc->id.value));
+}
+static void rc_unlock(WT_RESOURCE *rc)
+{
+
DBUG_PRINT("wt", ("UNLOCK resid=%ld", (ulong)rc->id.value)); + rw_unlock(&rc->lock); +} +#endif + +/* + All resources are stored in a lock-free hash. Different threads + may add new resources and perform deadlock detection concurrently. +*/ +static LF_HASH reshash; + +/** + WT_RESOURCE constructor + + It's called from lf_hash and takes a pointer to an LF_SLIST instance. + WT_RESOURCE is located at arg+sizeof(LF_SLIST) +*/ +static void wt_resource_init(uchar *arg) +{ + WT_RESOURCE *rc= (WT_RESOURCE*)(arg+LF_HASH_OVERHEAD); + DBUG_ENTER("wt_resource_init"); + + bzero(rc, sizeof(*rc)); + rc_rwlock_init(rc); + pthread_cond_init(&rc->cond, 0); + my_init_dynamic_array(&rc->owners, sizeof(WT_THD *), 0, 5); + DBUG_VOID_RETURN; +} + +/** + WT_RESOURCE destructor + + It's called from lf_hash and takes a pointer to an LF_SLIST instance. + WT_RESOURCE is located at arg+sizeof(LF_SLIST) +*/ +static void wt_resource_destroy(uchar *arg) +{ + WT_RESOURCE *rc= (WT_RESOURCE*)(arg+LF_HASH_OVERHEAD); + DBUG_ENTER("wt_resource_destroy"); + + DBUG_ASSERT(rc->owners.elements == 0); + rc_rwlock_destroy(rc); + pthread_cond_destroy(&rc->cond); + delete_dynamic(&rc->owners); + DBUG_VOID_RETURN; +} + +void wt_init() +{ + DBUG_ENTER("wt_init"); + DBUG_ASSERT(reshash.alloc.constructor != wt_resource_init); + + lf_hash_init(&reshash, sizeof(WT_RESOURCE), LF_HASH_UNIQUE, 0, + sizeof_WT_RESOURCE_ID, 0, 0); + reshash.alloc.constructor= wt_resource_init; + reshash.alloc.destructor= wt_resource_destroy; + /* + Note a trick: we initialize the hash with the real element size, + but fix it later to a shortened element size. This way + the allocator will allocate elements correctly, but + lf_hash_insert() will only overwrite part of the element with memcpy(). + lock, condition, and dynamic array will be intact. + */ + reshash.element_size= offsetof(WT_RESOURCE, lock); + bzero(wt_wait_stats, sizeof(wt_wait_stats)); + bzero(wt_cycle_stats, sizeof(wt_cycle_stats)); + wt_success_stats= 0; + { /* initialize wt_wait_table[]. from 1 us to 1 min, log e scale */ + int i; + double from= log(1); /* 1 us */ + double to= log(60e6); /* 1 min */ + for (i= 0; i < WT_WAIT_STATS; i++) + { + wt_wait_table[i]= (ulonglong)exp((to-from)/(WT_WAIT_STATS-1)*i+from); + DBUG_ASSERT(i == 0 || wt_wait_table[i-1] != wt_wait_table[i]); + } + } + my_atomic_rwlock_init(&cycle_stats_lock); + my_atomic_rwlock_init(&success_stats_lock); + my_atomic_rwlock_init(&wait_stats_lock); + DBUG_VOID_RETURN; +} + +void wt_end() +{ + DBUG_ENTER("wt_end"); + + DBUG_ASSERT(reshash.count == 0); + lf_hash_destroy(&reshash); + my_atomic_rwlock_destroy(&cycle_stats_lock); + my_atomic_rwlock_destroy(&success_stats_lock); + my_atomic_rwlock_destroy(&wait_stats_lock); + DBUG_VOID_RETURN; +} + +/** + Lazy WT_THD initialization + + Cheap initialization of WT_THD. Only initialize fields that don't require + memory allocations - basically, it only does assignments. The rest of the + WT_THD structure will be initialized on demand, on the first use. + This allows one to initialize lazily all WT_THD structures, even if some + (or even most) of them will never be used for deadlock detection. + + @param ds a pointer to deadlock search depth short value + @param ts a pointer to deadlock timeout short value + @param dl a pointer to deadlock search depth long value + @param tl a pointer to deadlock timeout long value + + @note these are pointers to values, and WT_THD stores them as pointers. + It allows one later to change search depths and timeouts for existing + threads. 
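+
+  For example (an illustrative sketch with made-up names, not code from
+  this file), a caller typically keeps the tunables in long-lived
+  variables and passes their addresses:
+
+    static ulong depth_short= 4,   timeout_short= 100;
+    static ulong depth_long= 15,   timeout_long= 10000;
+    ...
+    wt_thd_lazy_init(&thd->wt, &depth_short, &timeout_short,
+                     &depth_long, &timeout_long);
+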
It also means that the pointers must stay valid for the lifetime + of WT_THD. +*/ +void wt_thd_lazy_init(WT_THD *thd, const ulong *ds, const ulong *ts, + const ulong *dl, const ulong *tl) +{ + DBUG_ENTER("wt_thd_lazy_init"); + thd->waiting_for= 0; + thd->weight= 0; + thd->deadlock_search_depth_short= ds; + thd->timeout_short= ts; + thd->deadlock_search_depth_long= dl; + thd->timeout_long= tl; + /* dynamic array is also initialized lazily - without memory allocations */ + my_init_dynamic_array(&thd->my_resources, sizeof(WT_RESOURCE *), 0, 5); +#ifndef DBUG_OFF + thd->name= my_thread_name(); +#endif + DBUG_VOID_RETURN; +} + +/** + Finalize WT_THD initialization + + After lazy WT_THD initialization, parts of the structure are still + uninitialized. This function completes the initialization, allocating + memory, if necessary. It's called automatically on demand, when WT_THD + is about to be used. +*/ +static int fix_thd_pins(WT_THD *thd) +{ + if (unlikely(thd->pins == 0)) + { + thd->pins= lf_hash_get_pins(&reshash); +#ifndef DBUG_OFF + thd->name= my_thread_name(); +#endif + } + return thd->pins == 0; +} + +void wt_thd_destroy(WT_THD *thd) +{ + DBUG_ENTER("wt_thd_destroy"); + + DBUG_ASSERT(thd->my_resources.elements == 0); + DBUG_ASSERT(thd->waiting_for == 0); + + if (thd->pins != 0) + lf_hash_put_pins(thd->pins); + + delete_dynamic(&thd->my_resources); + DBUG_VOID_RETURN; +} +/** + Trivial resource id comparison function - bytewise memcmp. + + It can be used in WT_RESOURCE_TYPE structures where bytewise + comparison of values is sufficient. +*/ +my_bool wt_resource_id_memcmp(const void *a, const void *b) +{ + /* we use the fact that there's no padding in the middle of WT_RESOURCE_ID */ + compile_time_assert(offsetof(WT_RESOURCE_ID, type) == sizeof(ulonglong)); + return memcmp(a, b, sizeof_WT_RESOURCE_ID); +} + +/** + arguments for the recursive deadlock_search function +*/ +struct deadlock_arg { + WT_THD * const thd; /**< starting point of a search */ + uint const max_depth; /**< search depth limit */ + WT_THD *victim; /**< a thread to be killed to resolve a deadlock */ + WT_RESOURCE *last_locked_rc; /**< see comment at the end of deadlock_search() */ +}; + +/** + helper function to change the victim, according to the weight +*/ +static void change_victim(WT_THD* found, struct deadlock_arg *arg) +{ + if (found->weight < arg->victim->weight) + { + if (arg->victim != arg->thd) + { + rc_unlock(arg->victim->waiting_for); /* release the previous victim */ + DBUG_ASSERT(arg->last_locked_rc == found->waiting_for); + } + arg->victim= found; + arg->last_locked_rc= 0; + } +} + +/** + recursive loop detection in a wait-for graph with a limited search depth +*/ +static int deadlock_search(struct deadlock_arg *arg, WT_THD *blocker, + uint depth) +{ + WT_RESOURCE *rc, *volatile *shared_ptr= &blocker->waiting_for; + WT_THD *cursor; + uint i; + int ret= WT_OK; + DBUG_ENTER("deadlock_search"); + DBUG_PRINT("wt", ("enter: thd=%s, blocker=%s, depth=%u", + arg->thd->name, blocker->name, depth)); + + LF_REQUIRE_PINS(1); + + arg->last_locked_rc= 0; + + if (depth > arg->max_depth) + { + DBUG_PRINT("wt", ("exit: WT_DEPTH_EXCEEDED (early)")); + DBUG_RETURN(WT_DEPTH_EXCEEDED); + } + +retry: + /* + safe dereference as explained in lf_alloc-pin.c + (in short: protects against lf_alloc_free() in lf_hash_delete()) + */ + do + { + rc= *shared_ptr; + lf_pin(arg->thd->pins, 0, rc); + } while (rc != *shared_ptr && LF_BACKOFF); + + if (rc == 0) + { + DBUG_PRINT("wt", ("exit: OK (early)")); + DBUG_RETURN(0); + } + + 
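+  /*
+    The pin/verify loop above is the standard lf_alloc read protocol: pin
+    the pointer, then re-read it to check that the object was not freed
+    (and possibly reused) in between. Only when the pointer is stable
+    under the pin is it safe to dereference it, as rc_rdlock() does below.
+  */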
+  rc_rdlock(rc);
+  if (rc->state != ACTIVE || *shared_ptr != rc)
+  {
+    /* blocker is not waiting on this resource anymore */
+    rc_unlock(rc);
+    lf_unpin(arg->thd->pins, 0);
+    goto retry;
+  }
+  /* as the state is locked, we can unpin now */
+  lf_unpin(arg->thd->pins, 0);
+
+  /*
+    Below is not a pure depth-first search. It's a depth-first search with
+    the slightest hint of breadth-first. Depth-first is:
+
+      check(element, X):
+        foreach current in element->nodes[] do:
+          if current == X return error;
+          check(current, X);
+
+    while we do
+
+      check(element, X):
+        foreach current in element->nodes[] do:
+          if current == X return error;
+        foreach current in element->nodes[] do:
+          check(current, X);
+
+    preferring shorter deadlocks over longer ones.
+  */
+  for (i= 0; i < rc->owners.elements; i++)
+  {
+    cursor= *dynamic_element(&rc->owners, i, WT_THD**);
+    /*
+      We're only looking for (and detecting) cycles that include 'arg->thd'.
+      That is, only deadlocks that *we* have created. For example,
+        thd->A->B->thd
+      (thd waits for A, A waits for B, while B is waiting for thd).
+      While walking the graph we can encounter other cycles, e.g.
+        thd->A->B->C->A
+      This will not be detected. Instead we will walk it in circles until
+      the search depth limit is reached (the latter guarantees that an
+      infinite loop is impossible). We expect the thread that has created
+      the cycle (one of A, B, and C) to detect its deadlock.
+    */
+    if (cursor == arg->thd)
+    {
+      ret= WT_DEADLOCK;
+      increment_cycle_stats(depth, arg->max_depth ==
+                                   *arg->thd->deadlock_search_depth_long);
+      arg->victim= cursor;
+      goto end;
+    }
+  }
+  for (i= 0; i < rc->owners.elements; i++)
+  {
+    cursor= *dynamic_element(&rc->owners, i, WT_THD**);
+    switch (deadlock_search(arg, cursor, depth+1)) {
+    case WT_OK:
+      break;
+    case WT_DEPTH_EXCEEDED:
+      ret= WT_DEPTH_EXCEEDED;
+      break;
+    case WT_DEADLOCK:
+      ret= WT_DEADLOCK;
+      change_victim(cursor, arg);     /* also sets arg->last_locked_rc to 0 */
+      i= rc->owners.elements;         /* jump out of the loop */
+      break;
+    default:
+      DBUG_ASSERT(0);
+    }
+    if (arg->last_locked_rc)
+      rc_unlock(arg->last_locked_rc);
+  }
+end:
+  /*
+    Note that 'rc' is locked in this function, but it's never unlocked here.
+    Instead it's saved in arg->last_locked_rc and the *caller* is
+    expected to unlock it. It's done to support different killing
+    strategies. This is how it works:
+    Assuming a graph
+
+      thd->A->B->C->thd
+
+    the deadlock_search() function starts from thd and locks it (in fact it
+    locks not the thd, but the resource it is waiting on; below, for
+    simplicity, I'll talk about "locking a thd"). Then it goes down
+    recursively, locks A, and so on. Goes down recursively, locks B.
+    Goes down recursively, locks C.
+    Notices that C is waiting on thd. Deadlock detected. Sets arg->victim=thd.
+    Returns from the last deadlock_search() call. C stays locked!
+    Now it checks whether C is a more appropriate victim than 'thd'.
+    If yes - arg->victim=C, otherwise C is unlocked. Returns. B stays locked.
+    Now it checks whether B is a more appropriate victim than arg->victim.
+    If yes - the old arg->victim is unlocked and arg->victim=B,
+    otherwise B is unlocked. Return.
+    And so on.
+
+    In short, a resource is locked in a frame. But it's not unlocked in the
+    same frame, it's unlocked by the caller, and only after the caller checks
+    that it doesn't need to use the current WT_THD as a victim. If it does -
+    the lock is kept and the old victim's resource is unlocked.
When the recursion + is unrolled and we are back to deadlock() function, there are only two + locks left - on thd and on the victim. + */ + arg->last_locked_rc= rc; + DBUG_PRINT("wt", ("exit: %s", + ret == WT_DEPTH_EXCEEDED ? "WT_DEPTH_EXCEEDED" : + ret ? "WT_DEADLOCK" : "OK")); + DBUG_RETURN(ret); +} + +/** + Deadlock detection in a wait-for graph + + A wrapper for recursive deadlock_search() - prepares deadlock_arg structure, + invokes deadlock_search(), increments statistics, notifies the victim. + + @param thd thread that is going to wait. Deadlock is detected + if, while walking the graph, we reach a thread that + is waiting on thd + @param blocker starting point of a search. In wt_thd_cond_timedwait() + it's thd, in wt_thd_will_wait_for() it's a thread that + thd is going to wait for + @param depth starting search depth. In general it's the number of + edges in the wait-for graph between thd and the + blocker. Practically only two values are used (and + supported) - when thd == blocker it's 0, when thd + waits directly for blocker, it's 1 + @param max_depth search depth limit +*/ +static int deadlock(WT_THD *thd, WT_THD *blocker, uint depth, + uint max_depth) +{ + struct deadlock_arg arg= {thd, max_depth, 0, 0}; + int ret; + DBUG_ENTER("deadlock"); + DBUG_ASSERT(depth < 2); + ret= deadlock_search(&arg, blocker, depth); + if (ret == WT_DEPTH_EXCEEDED) + { + increment_cycle_stats(WT_CYCLE_STATS, max_depth == + *thd->deadlock_search_depth_long); + ret= WT_OK; + } + /* + if we started with depth==1, blocker was never considered for a victim + in deadlock_search(). Do it here. + */ + if (ret == WT_DEADLOCK && depth) + change_victim(blocker, &arg); + if (arg.last_locked_rc) + { + /* + Special return code if there's nobody to wait for. + + depth == 0 means that we start the search from thd (thd == blocker). + ret == WT_OK means that no cycle was found and + arg.last_locked_rc == thd->waiting_for. + and arg.last_locked_rc->owners.elements == 0 means that + (applying the rule above) thd->waiting_for->owners.elements == 0, + and thd doesn't have anybody to wait for. + */ + if (depth == 0 && ret == WT_OK && arg.last_locked_rc->owners.elements == 0) + { + DBUG_ASSERT(thd == blocker); + DBUG_ASSERT(arg.last_locked_rc == thd->waiting_for); + ret= WT_FREE_TO_GO; + } + rc_unlock(arg.last_locked_rc); + } + /* notify the victim, if appropriate */ + if (ret == WT_DEADLOCK && arg.victim != thd) + { + DBUG_PRINT("wt", ("killing %s", arg.victim->name)); + arg.victim->killed= 1; + pthread_cond_broadcast(&arg.victim->waiting_for->cond); + rc_unlock(arg.victim->waiting_for); + ret= WT_OK; + } + DBUG_RETURN(ret); +} + + +/** + Delete an element from reshash if it has no waiters or owners + + rc->lock must be locked by the caller and it's unlocked on return. +*/ +static int unlock_lock_and_free_resource(WT_THD *thd, WT_RESOURCE *rc) +{ + uint keylen; + const void *key; + DBUG_ENTER("unlock_lock_and_free_resource"); + + DBUG_ASSERT(rc->state == ACTIVE); + + if (rc->owners.elements || rc->waiter_count) + { + DBUG_PRINT("wt", ("nothing to do, %u owners, %u waiters", + rc->owners.elements, rc->waiter_count)); + rc_unlock(rc); + DBUG_RETURN(0); + } + + if (fix_thd_pins(thd)) + { + rc_unlock(rc); + DBUG_RETURN(1); + } + + /* XXX if (rc->id.type->make_key) key= rc->id.type->make_key(&rc->id, &keylen); else */ + { + key= &rc->id; + keylen= sizeof_WT_RESOURCE_ID; + } + + /* + To free the element correctly we need to: + 1. take its lock (already done). + 2. set the state to FREE + 3. release the lock + 4. 
remove from the hash + */ + rc->state= FREE; + rc_unlock(rc); + DBUG_RETURN(lf_hash_delete(&reshash, thd->pins, key, keylen) == -1); +} + + +/** + register the fact that thd is not waiting anymore + + decrease waiter_count, clear waiting_for, free the resource if appropriate. + thd->waiting_for must be locked! +*/ +static int stop_waiting_locked(WT_THD *thd) +{ + int ret; + WT_RESOURCE *rc= thd->waiting_for; + DBUG_ENTER("stop_waiting_locked"); + + DBUG_ASSERT(rc->waiter_count); + DBUG_ASSERT(rc->state == ACTIVE); + rc->waiter_count--; + thd->waiting_for= 0; + ret= unlock_lock_and_free_resource(thd, rc); + DBUG_RETURN((thd->killed || ret) ? WT_DEADLOCK : WT_OK); +} + +/** + register the fact that thd is not waiting anymore + + locks thd->waiting_for and calls stop_waiting_locked(). +*/ +static int stop_waiting(WT_THD *thd) +{ + int ret; + WT_RESOURCE *rc= thd->waiting_for; + DBUG_ENTER("stop_waiting"); + + if (!rc) + DBUG_RETURN(WT_OK); + /* + nobody's trying to free the resource now, + as its waiter_count is guaranteed to be non-zero + */ + rc_wrlock(rc); + ret= stop_waiting_locked(thd); + DBUG_RETURN(ret); +} + +/** + notify the system that a thread needs to wait for another thread + + called by a *waiter* to declare that it (thd) will wait for another + thread (blocker) on a specific resource (resid). + can be called many times, if many blockers own a blocking resource. + but must always be called with the same resource id - a thread cannot + wait for more than one resource at a time. + + @return WT_OK or WT_DEADLOCK + + As a new edge is added to the wait-for graph, a deadlock detection is + performed for this new edge. +*/ +int wt_thd_will_wait_for(WT_THD *thd, WT_THD *blocker, + const WT_RESOURCE_ID *resid) +{ + uint i; + WT_RESOURCE *rc; + DBUG_ENTER("wt_thd_will_wait_for"); + + LF_REQUIRE_PINS(3); + + DBUG_PRINT("wt", ("enter: thd=%s, blocker=%s, resid=%lu", + thd->name, blocker->name, (ulong)resid->value)); + + if (fix_thd_pins(thd)) + DBUG_RETURN(WT_DEADLOCK); + + if (thd->waiting_for == 0) + { + uint keylen; + const void *key; + /* XXX if (restype->make_key) key= restype->make_key(resid, &keylen); else */ + { + key= resid; + keylen= sizeof_WT_RESOURCE_ID; + } + + DBUG_PRINT("wt", ("first blocker")); + +retry: + while ((rc= lf_hash_search(&reshash, thd->pins, key, keylen)) == 0) + { + WT_RESOURCE tmp; + + DBUG_PRINT("wt", ("failed to find rc in hash, inserting")); + bzero(&tmp, sizeof(tmp)); + tmp.id= *resid; + tmp.state= ACTIVE; + + if (lf_hash_insert(&reshash, thd->pins, &tmp) == -1) /* if OOM */ + DBUG_RETURN(WT_DEADLOCK); + /* + Two cases: either lf_hash_insert() failed - because another thread + has just inserted a resource with the same id - and we need to retry. + Or lf_hash_insert() succeeded, and then we need to repeat + lf_hash_search() to find a real address of the newly inserted element. + That is, we don't care what lf_hash_insert() has returned. + And we need to repeat the loop anyway. 
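+
+        (This insert-then-search pattern is inherent to lf_hash:
+        lf_hash_insert() copies the caller's structure into memory that
+        the hash owns, so the 'tmp' on our stack never becomes the live
+        element - its address can only be obtained by searching again.)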
+ */ + } + if (rc == MY_ERRPTR) + DBUG_RETURN(WT_DEADLOCK); + + DBUG_PRINT("wt", ("found in hash rc=%p", rc)); + + rc_wrlock(rc); + if (rc->state != ACTIVE) + { + DBUG_PRINT("wt", ("but it's not active, retrying")); + /* Somebody has freed the element while we weren't looking */ + rc_unlock(rc); + lf_hash_search_unpin(thd->pins); + goto retry; + } + + lf_hash_search_unpin(thd->pins); /* the element cannot go away anymore */ + thd->waiting_for= rc; + rc->waiter_count++; + thd->killed= 0; + } + else + { + DBUG_ASSERT(thd->waiting_for->id.type == resid->type); + DBUG_ASSERT(resid->type->compare(&thd->waiting_for->id, resid) == 0); + DBUG_PRINT("wt", ("adding another blocker")); + + /* + we can safely access the resource here, it's in the hash as it has + non-zero waiter_count + */ + rc= thd->waiting_for; + rc_wrlock(rc); + DBUG_ASSERT(rc->waiter_count); + DBUG_ASSERT(rc->state == ACTIVE); + + if (thd->killed) + { + stop_waiting_locked(thd); + DBUG_RETURN(WT_DEADLOCK); + } + } + /* + Another thread could be waiting on this resource for this very 'blocker'. + In this case we should not add it to the list for the second time. + */ + for (i= 0; i < rc->owners.elements; i++) + if (*dynamic_element(&rc->owners, i, WT_THD**) == blocker) + break; + if (i >= rc->owners.elements) + { + if (push_dynamic(&blocker->my_resources, (void*)&rc)) + { + stop_waiting_locked(thd); + DBUG_RETURN(WT_DEADLOCK); /* deadlock and OOM use the same error code */ + } + if (push_dynamic(&rc->owners, (void*)&blocker)) + { + pop_dynamic(&blocker->my_resources); + stop_waiting_locked(thd); + DBUG_RETURN(WT_DEADLOCK); + } + } + rc_unlock(rc); + + if (deadlock(thd, blocker, 1, *thd->deadlock_search_depth_short) != WT_OK) + { + stop_waiting(thd); + DBUG_RETURN(WT_DEADLOCK); + } + DBUG_RETURN(WT_OK); +} + +/** + called by a *waiter* (thd) to start waiting + + It's supposed to be a drop-in replacement for + pthread_cond_timedwait(), and it takes mutex as an argument. + + @return one of WT_TIMEOUT, WT_DEADLOCK, WT_OK +*/ +int wt_thd_cond_timedwait(WT_THD *thd, pthread_mutex_t *mutex) +{ + int ret= WT_TIMEOUT; + struct timespec timeout; + ulonglong before, after, starttime; + WT_RESOURCE *rc= thd->waiting_for; + DBUG_ENTER("wt_thd_cond_timedwait"); + DBUG_PRINT("wt", ("enter: thd=%s, rc=%p", thd->name, rc)); + +#ifndef DBUG_OFF + if (rc->cond_mutex) + DBUG_ASSERT(rc->cond_mutex == mutex); + else + rc->cond_mutex= mutex; + safe_mutex_assert_owner(mutex); +#endif + + before= starttime= my_getsystime(); + +#ifdef __WIN__ + /* + only for the sake of Windows we distinguish between + 'before' and 'starttime': + + my_getsystime() returns high-resolution value, that cannot be used for + waiting (it doesn't follow system clock changes), but is good for time + intervals. + + GetSystemTimeAsFileTime() follows system clock, but is low-resolution + and will result in lousy intervals. 
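+
+    So 'starttime' (used below for the absolute wakeup time) comes from
+    GetSystemTimeAsFileTime(), while 'before' and 'after' (used only to
+    measure the waited interval for the statistics) keep using
+    my_getsystime().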
+ */ + GetSystemTimeAsFileTime((PFILETIME)&starttime); +#endif + + rc_wrlock(rc); + if (rc->owners.elements == 0) + ret= WT_OK; + rc_unlock(rc); + + set_timespec_time_nsec(timeout, starttime, (*thd->timeout_short)*ULL(1000)); + if (ret == WT_TIMEOUT && !thd->killed) + ret= pthread_cond_timedwait(&rc->cond, mutex, &timeout); + if (ret == WT_TIMEOUT && !thd->killed) + { + int r= deadlock(thd, thd, 0, *thd->deadlock_search_depth_long); + if (r == WT_FREE_TO_GO) + ret= WT_OK; + else if (r != WT_OK) + ret= WT_DEADLOCK; + else if (*thd->timeout_long > *thd->timeout_short) + { + set_timespec_time_nsec(timeout, starttime, (*thd->timeout_long)*ULL(1000)); + if (!thd->killed) + ret= pthread_cond_timedwait(&rc->cond, mutex, &timeout); + } + } + after= my_getsystime(); + if (stop_waiting(thd) == WT_DEADLOCK) /* if we're killed */ + ret= WT_DEADLOCK; + increment_wait_stats(after-before, ret); + if (ret == WT_OK) + increment_success_stats(); + DBUG_RETURN(ret); +} + +/** + called by a *blocker* when it releases a resource + + it's conceptually similar to pthread_cond_broadcast, and must be done + under the same mutex as wt_thd_cond_timedwait(). + + @param resid a resource to release. 0 to release all resources +*/ + +void wt_thd_release(WT_THD *thd, const WT_RESOURCE_ID *resid) +{ + uint i; + DBUG_ENTER("wt_thd_release"); + + for (i= 0; i < thd->my_resources.elements; i++) + { + WT_RESOURCE *rc= *dynamic_element(&thd->my_resources, i, WT_RESOURCE**); + if (!resid || (resid->type->compare(&rc->id, resid) == 0)) + { + uint j; + + rc_wrlock(rc); + /* + nobody's trying to free the resource now, + as its owners[] array is not empty (at least thd must be there) + */ + DBUG_ASSERT(rc->state == ACTIVE); + for (j= 0; j < rc->owners.elements; j++) + if (*dynamic_element(&rc->owners, j, WT_THD**) == thd) + break; + DBUG_ASSERT(j < rc->owners.elements); + delete_dynamic_element(&rc->owners, j); + if (rc->owners.elements == 0) + { + pthread_cond_broadcast(&rc->cond); +#ifndef DBUG_OFF + if (rc->cond_mutex) + safe_mutex_assert_owner(rc->cond_mutex); +#endif + } + unlock_lock_and_free_resource(thd, rc); + if (resid) + { + delete_dynamic_element(&thd->my_resources, i); + DBUG_VOID_RETURN; + } + } + } + if (!resid) + reset_dynamic(&thd->my_resources); + DBUG_VOID_RETURN; +} + diff --git a/mysys/wqueue.c b/mysys/wqueue.c new file mode 100644 index 00000000000..fcc0a39725d --- /dev/null +++ b/mysys/wqueue.c @@ -0,0 +1,225 @@ + +#include <wqueue.h> + +#define STRUCT_PTR(TYPE, MEMBER, a) \ + (TYPE *) ((char *) (a) - offsetof(TYPE, MEMBER)) +/* + Link a thread into double-linked queue of waiting threads. + + SYNOPSIS + wqueue_link_into_queue() + wqueue pointer to the queue structure + thread pointer to the thread to be added to the queue + + RETURN VALUE + none + + NOTES. + Queue is represented by a circular list of the thread structures + The list is double-linked of the type (**prev,*next), accessed by + a pointer to the last element. 
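+
+    For example, after linking T1, T2 and T3 (in this order):
+
+      wqueue->last_thread == T3
+      next:  T3 -> T1 -> T2 -> T3  (circular)
+      prev:  each thread's prev holds the address of the 'next' pointer
+             that points to it, e.g. T1->prev == &T3->next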
+*/
+
+void wqueue_link_into_queue(WQUEUE *wqueue, struct st_my_thread_var *thread)
+{
+  struct st_my_thread_var *last;
+  if (!(last= wqueue->last_thread))
+  {
+    /* Queue is empty */
+    thread->next= thread;
+    thread->prev= &thread->next;
+  }
+  else
+  {
+    thread->prev= last->next->prev;
+    last->next->prev= &thread->next;
+    thread->next= last->next;
+    last->next= thread;
+  }
+  wqueue->last_thread= thread;
+}
+
+
+/*
+  Add a thread to single-linked queue of waiting threads
+
+  SYNOPSIS
+    wqueue_add_to_queue()
+      wqueue        pointer to the queue structure
+      thread        pointer to the thread to be added to the queue
+
+  RETURN VALUE
+    none
+
+  NOTES.
+    Queue is represented by a circular list of the thread structures
+    The list is single-linked of the type (*next), accessed by a pointer
+    to the last element.
+*/
+
+void wqueue_add_to_queue(WQUEUE *wqueue, struct st_my_thread_var *thread)
+{
+  struct st_my_thread_var *last;
+  if (!(last= wqueue->last_thread))
+    thread->next= thread;
+  else
+  {
+    thread->next= last->next;
+    last->next= thread;
+  }
+#ifndef DBUG_OFF
+  thread->prev= NULL;               /* force segfault if used */
+#endif
+  wqueue->last_thread= thread;
+}
+
+/*
+  Unlink a thread from double-linked queue of waiting threads
+
+  SYNOPSIS
+    wqueue_unlink_from_queue()
+      wqueue        pointer to the queue structure
+      thread        pointer to the thread to be removed from the queue
+
+  RETURN VALUE
+    none
+
+  NOTES.
+    See NOTES for wqueue_link_into_queue
+*/
+
+void wqueue_unlink_from_queue(WQUEUE *wqueue, struct st_my_thread_var *thread)
+{
+  if (thread->next == thread)
+    /* The queue contains only one member */
+    wqueue->last_thread= NULL;
+  else
+  {
+    thread->next->prev= thread->prev;
+    *thread->prev= thread->next;
+    if (wqueue->last_thread == thread)
+      wqueue->last_thread= STRUCT_PTR(struct st_my_thread_var, next,
+                                      thread->prev);
+  }
+  thread->next= NULL;
+}
+
+
+/*
+  Remove all threads from the queue, signaling them to proceed
+
+  SYNOPSIS
+    wqueue_release_queue()
+      wqueue        pointer to the queue structure
+
+  RETURN VALUE
+    none
+
+  NOTES.
+    See NOTES for wqueue_add_to_queue
+    When removed from the queue each thread is signaled via condition
+    variable thread->suspend.
+*/
+
+void wqueue_release_queue(WQUEUE *wqueue)
+{
+  struct st_my_thread_var *last= wqueue->last_thread;
+  struct st_my_thread_var *next= last->next;
+  struct st_my_thread_var *thread;
+  do
+  {
+    thread= next;
+    pthread_cond_signal(&thread->suspend);
+    next= thread->next;
+    thread->next= NULL;
+  }
+  while (thread != last);
+  wqueue->last_thread= NULL;
+}
+
+
+/**
+  @brief Removes all threads waiting for read or the first one waiting for
+  write.
+
+  @param wqueue  pointer to the queue structure
+
+  @note This function is applicable only to single linked lists.
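+
+  For example, if the queue holds (oldest first) R1, W1, R2, where R*
+  wait for a read lock and W* for a write lock, R1 and R2 are signalled
+  and W1 alone stays in the queue. Had W1 been the oldest waiter, only
+  W1 would have been signalled.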
+*/
+
+void wqueue_release_one_locktype_from_queue(WQUEUE *wqueue)
+{
+  struct st_my_thread_var *last= wqueue->last_thread;
+  struct st_my_thread_var *next= last->next;
+  struct st_my_thread_var *thread;
+  struct st_my_thread_var *new_list= NULL;
+  uint first_type= next->lock_type;
+  if (first_type == MY_PTHREAD_LOCK_WRITE)
+  {
+    /* release the first thread waiting for a write lock */
+    pthread_cond_signal(&next->suspend);
+    if (next == last)
+      wqueue->last_thread= NULL;
+    else
+      last->next= next->next;
+    next->next= NULL;
+    return;
+  }
+  do
+  {
+    thread= next;
+    next= thread->next;
+    if (thread->lock_type == MY_PTHREAD_LOCK_WRITE)
+    {
+      /* keep a thread waiting for a write lock in the queue */
+      if (new_list)
+      {
+        thread->next= new_list->next;
+        new_list= new_list->next= thread;
+      }
+      else
+        new_list= thread->next= thread;
+    }
+    else
+    {
+      /* release a thread waiting for a read lock */
+      pthread_cond_signal(&thread->suspend);
+      thread->next= NULL;
+    }
+  } while (thread != last);
+  wqueue->last_thread= new_list;
+}
+
+
+/*
+  Add a thread to the queue and wait until it is signalled to proceed
+
+  SYNOPSIS
+    wqueue_add_and_wait()
+      wqueue        queue to add the thread to
+      thread        thread which is going to wait
+      lock          mutex needed for the operation
+*/
+
+void wqueue_add_and_wait(WQUEUE *wqueue,
+                         struct st_my_thread_var *thread,
+                         pthread_mutex_t *lock)
+{
+  DBUG_ENTER("wqueue_add_and_wait");
+  DBUG_PRINT("enter",
+             ("thread: 0x%lx  cond: 0x%lx  mutex: 0x%lx",
+              (ulong) thread, (ulong) &thread->suspend, (ulong) lock));
+  wqueue_add_to_queue(wqueue, thread);
+  do
+  {
+    DBUG_PRINT("info", ("wait... cond: 0x%lx  mutex: 0x%lx",
+                        (ulong) &thread->suspend, (ulong) lock));
+    pthread_cond_wait(&thread->suspend, lock);
+    DBUG_PRINT("info", ("wait done  cond: 0x%lx  mutex: 0x%lx  next: 0x%lx",
+                        (ulong) &thread->suspend, (ulong) lock,
+                        (ulong) thread->next));
+  }
+  while (thread->next);
+  DBUG_VOID_RETURN;
+}
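Taken together, wqueue_add_and_wait() and wqueue_release_queue() form a simple suspend/resume protocol on top of a caller-provided mutex: a waiter enqueues itself and sleeps on its own thread->suspend condition until its next pointer is reset to NULL, and a releaser signals every queued thread under the same mutex. A minimal usage sketch (hypothetical caller code, not part of the patch; it assumes the WQUEUE and st_my_thread_var definitions from wqueue.h and my_pthread.h, and that the mutex is initialized elsewhere):

    #include <my_global.h>
    #include <my_pthread.h>
    #include <wqueue.h>

    static WQUEUE waiters;                /* zero-initialized: empty queue */
    static pthread_mutex_t waiters_lock;  /* protects 'waiters' */

    /* waiter side: block until the event is posted */
    static void wait_for_event(void)
    {
      struct st_my_thread_var *me= my_thread_var;
      pthread_mutex_lock(&waiters_lock);
      /* enqueue 'me' and sleep on me->suspend until me->next is reset */
      wqueue_add_and_wait(&waiters, me, &waiters_lock);
      pthread_mutex_unlock(&waiters_lock);
    }

    /* poster side: wake up every queued waiter */
    static void post_event(void)
    {
      pthread_mutex_lock(&waiters_lock);
      if (waiters.last_thread)        /* release expects a non-empty queue */
        wqueue_release_queue(&waiters);
      pthread_mutex_unlock(&waiters_lock);
    }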